1 /* Copyright (C) 1988-2019 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "params.h"
62 #include "cselib.h"
63 #include "sched-int.h"
64 #include "opts.h"
65 #include "tree-pass.h"
66 #include "context.h"
67 #include "pass_manager.h"
68 #include "target-globals.h"
69 #include "gimple-iterator.h"
70 #include "tree-vectorizer.h"
71 #include "shrink-wrap.h"
72 #include "builtins.h"
73 #include "rtl-iter.h"
74 #include "tree-iterator.h"
75 #include "dbgcnt.h"
76 #include "case-cfn-macros.h"
77 #include "dojump.h"
78 #include "fold-const-call.h"
79 #include "tree-vrp.h"
80 #include "tree-ssanames.h"
81 #include "selftest.h"
82 #include "selftest-rtl.h"
83 #include "print-rtl.h"
84 #include "intl.h"
85 #include "ifcvt.h"
86 #include "symbol-summary.h"
87 #include "ipa-prop.h"
88 #include "ipa-fnsummary.h"
89 #include "wide-int-bitmask.h"
90 #include "tree-vector-builder.h"
91 #include "debug.h"
92 #include "dwarf2out.h"
93 #include "i386-options.h"
94 #include "i386-builtins.h"
95 #include "i386-expand.h"
96
97 /* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
102
103 void
104 split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106 {
107 machine_mode half_mode;
108 unsigned int byte;
109 rtx mem_op = NULL_RTX;
110 int mem_num = 0;
111
112 switch (mode)
113 {
114 case E_TImode:
115 half_mode = DImode;
116 break;
117 case E_DImode:
118 half_mode = SImode;
119 break;
120 default:
121 gcc_unreachable ();
122 }
123
124 byte = GET_MODE_SIZE (half_mode);
125
126 while (num--)
127 {
128 rtx op = operands[num];
129
130 /* simplify_subreg refuses to split volatile memory addresses,
131 but we still have to handle them. */
132 if (MEM_P (op))
133 {
134 if (mem_op && rtx_equal_p (op, mem_op))
135 {
136 lo_half[num] = lo_half[mem_num];
137 hi_half[num] = hi_half[mem_num];
138 }
139 else
140 {
141 mem_op = op;
142 mem_num = num;
143 lo_half[num] = adjust_address (op, half_mode, 0);
144 hi_half[num] = adjust_address (op, half_mode, byte);
145 }
146 }
147 else
148 {
149 lo_half[num] = simplify_gen_subreg (half_mode, op,
150 GET_MODE (op) == VOIDmode
151 ? mode : GET_MODE (op), 0);
152 hi_half[num] = simplify_gen_subreg (half_mode, op,
153 GET_MODE (op) == VOIDmode
154 ? mode : GET_MODE (op), byte);
155 }
156 }
157 }
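
/* For example (illustrative only): splitting the TImode pseudo (reg:TI 100)
   yields lo_half = (subreg:DI (reg:TI 100) 0) and
   hi_half = (subreg:DI (reg:TI 100) 8), while an offsettable MEM is split
   into two adjacent DImode MEMs at offsets 0 and 8.  */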
158
159 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
160 for the target. */
161
162 void
163 ix86_expand_clear (rtx dest)
164 {
165 rtx tmp;
166
167 /* We play register width games, which are only valid after reload. */
168 gcc_assert (reload_completed);
169
170 /* Avoid HImode and its attendant prefix byte. */
171 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
172 dest = gen_rtx_REG (SImode, REGNO (dest));
173 tmp = gen_rtx_SET (dest, const0_rtx);
174
175 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
176 {
177 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
178 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
179 }
180
181 emit_insn (tmp);
182 }
183
184 void
185 ix86_expand_move (machine_mode mode, rtx operands[])
186 {
187 rtx op0, op1;
188 rtx tmp, addend = NULL_RTX;
189 enum tls_model model;
190
191 op0 = operands[0];
192 op1 = operands[1];
193
194 switch (GET_CODE (op1))
195 {
196 case CONST:
197 tmp = XEXP (op1, 0);
198
199 if (GET_CODE (tmp) != PLUS
200 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
201 break;
202
203 op1 = XEXP (tmp, 0);
204 addend = XEXP (tmp, 1);
205 /* FALLTHRU */
206
207 case SYMBOL_REF:
208 model = SYMBOL_REF_TLS_MODEL (op1);
209
210 if (model)
211 op1 = legitimize_tls_address (op1, model, true);
212 else if (ix86_force_load_from_GOT_p (op1))
213 {
214 /* Load the external function address via GOT slot to avoid PLT. */
215 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
216 (TARGET_64BIT
217 ? UNSPEC_GOTPCREL
218 : UNSPEC_GOT));
219 op1 = gen_rtx_CONST (Pmode, op1);
220 op1 = gen_const_mem (Pmode, op1);
221 set_mem_alias_set (op1, ix86_GOT_alias_set ());
222 }
223 else
224 {
225 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
226 if (tmp)
227 {
228 op1 = tmp;
229 if (!addend)
230 break;
231 }
232 else
233 {
234 op1 = operands[1];
235 break;
236 }
237 }
238
239 if (addend)
240 {
241 op1 = force_operand (op1, NULL_RTX);
242 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
243 op0, 1, OPTAB_DIRECT);
244 }
245 else
246 op1 = force_operand (op1, op0);
247
248 if (op1 == op0)
249 return;
250
251 op1 = convert_to_mode (mode, op1, 1);
252
253 default:
254 break;
255 }
256
257 if ((flag_pic || MACHOPIC_INDIRECT)
258 && symbolic_operand (op1, mode))
259 {
260 if (TARGET_MACHO && !TARGET_64BIT)
261 {
262 #if TARGET_MACHO
263 /* dynamic-no-pic */
264 if (MACHOPIC_INDIRECT)
265 {
266 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
267 ? op0 : gen_reg_rtx (Pmode);
268 op1 = machopic_indirect_data_reference (op1, temp);
269 if (MACHOPIC_PURE)
270 op1 = machopic_legitimize_pic_address (op1, mode,
271 temp == op1 ? 0 : temp);
272 }
273 if (op0 != op1 && GET_CODE (op0) != MEM)
274 {
275 rtx insn = gen_rtx_SET (op0, op1);
276 emit_insn (insn);
277 return;
278 }
279 if (GET_CODE (op0) == MEM)
280 op1 = force_reg (Pmode, op1);
281 else
282 {
283 rtx temp = op0;
284 if (GET_CODE (temp) != REG)
285 temp = gen_reg_rtx (Pmode);
286 temp = legitimize_pic_address (op1, temp);
287 if (temp == op0)
288 return;
289 op1 = temp;
290 }
291 /* dynamic-no-pic */
292 #endif
293 }
294 else
295 {
296 if (MEM_P (op0))
297 op1 = force_reg (mode, op1);
298 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
299 {
300 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
301 op1 = legitimize_pic_address (op1, reg);
302 if (op0 == op1)
303 return;
304 op1 = convert_to_mode (mode, op1, 1);
305 }
306 }
307 }
308 else
309 {
310 if (MEM_P (op0)
311 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
312 || !push_operand (op0, mode))
313 && MEM_P (op1))
314 op1 = force_reg (mode, op1);
315
316 if (push_operand (op0, mode)
317 && ! general_no_elim_operand (op1, mode))
318 op1 = copy_to_mode_reg (mode, op1);
319
320 /* Force large constants in 64-bit compilation into a register
321 so that they get CSEd. */
322 if (can_create_pseudo_p ()
323 && (mode == DImode) && TARGET_64BIT
324 && immediate_operand (op1, mode)
325 && !x86_64_zext_immediate_operand (op1, VOIDmode)
326 && !register_operand (op0, mode)
327 && optimize)
328 op1 = copy_to_mode_reg (mode, op1);
329
330 if (can_create_pseudo_p ()
331 && CONST_DOUBLE_P (op1))
332 {
333 /* If we are loading a floating point constant to a register,
334 force the value to memory now, since we'll get better code
335 out of the back end. */
336
337 op1 = validize_mem (force_const_mem (mode, op1));
338 if (!register_operand (op0, mode))
339 {
340 rtx temp = gen_reg_rtx (mode);
341 emit_insn (gen_rtx_SET (temp, op1));
342 emit_move_insn (op0, temp);
343 return;
344 }
345 }
346 }
347
348 emit_insn (gen_rtx_SET (op0, op1));
349 }
350
351 void
352 ix86_expand_vector_move (machine_mode mode, rtx operands[])
353 {
354 rtx op0 = operands[0], op1 = operands[1];
355 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
356 psABI, since its biggest alignment is only 4 bytes. */
357 unsigned int align = (TARGET_IAMCU
358 ? GET_MODE_BITSIZE (mode)
359 : GET_MODE_ALIGNMENT (mode));
360
361 if (push_operand (op0, VOIDmode))
362 op0 = emit_move_resolve_push (mode, op0);
363
364 /* Force constants other than zero into memory. We do not know how
365 the instructions used to build constants modify the upper 64 bits
366 of the register; once we have that information we may be able
367 to handle some of them more efficiently. */
368 if (can_create_pseudo_p ()
369 && (CONSTANT_P (op1)
370 || (SUBREG_P (op1)
371 && CONSTANT_P (SUBREG_REG (op1))))
372 && ((register_operand (op0, mode)
373 && !standard_sse_constant_p (op1, mode))
374 /* ix86_expand_vector_move_misalign() does not like constants. */
375 || (SSE_REG_MODE_P (mode)
376 && MEM_P (op0)
377 && MEM_ALIGN (op0) < align)))
378 {
379 if (SUBREG_P (op1))
380 {
381 machine_mode imode = GET_MODE (SUBREG_REG (op1));
382 rtx r = force_const_mem (imode, SUBREG_REG (op1));
383 if (r)
384 r = validize_mem (r);
385 else
386 r = force_reg (imode, SUBREG_REG (op1));
387 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
388 }
389 else
390 op1 = validize_mem (force_const_mem (mode, op1));
391 }
392
393 /* We need to check memory alignment for SSE mode since attribute
394 can make operands unaligned. */
395 if (can_create_pseudo_p ()
396 && SSE_REG_MODE_P (mode)
397 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
398 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
399 {
400 rtx tmp[2];
401
402 /* ix86_expand_vector_move_misalign() does not like both
403 arguments in memory. */
404 if (!register_operand (op0, mode)
405 && !register_operand (op1, mode))
406 op1 = force_reg (mode, op1);
407
408 tmp[0] = op0; tmp[1] = op1;
409 ix86_expand_vector_move_misalign (mode, tmp);
410 return;
411 }
412
413 /* If neither operand is a register, make operand 1 a register. */
414 if (can_create_pseudo_p ()
415 && !register_operand (op0, mode)
416 && !register_operand (op1, mode))
417 {
418 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
419 return;
420 }
421
422 emit_insn (gen_rtx_SET (op0, op1));
423 }
424
425 /* Split 32-byte AVX unaligned load and store if needed. */
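/* When splitting is enabled, a 32-byte unaligned load is expanded into a
   16-byte load of the low half followed by a vinsertf128 of the high half,
   and a 32-byte unaligned store into two vextractf128 stores of the low
   and high 128-bit halves (a sketch of what the code below generates).  */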
426
427 static void
428 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
429 {
430 rtx m;
431 rtx (*extract) (rtx, rtx, rtx);
432 machine_mode mode;
433
434 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
435 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
436 {
437 emit_insn (gen_rtx_SET (op0, op1));
438 return;
439 }
440
441 rtx orig_op0 = NULL_RTX;
442 mode = GET_MODE (op0);
443 switch (GET_MODE_CLASS (mode))
444 {
445 case MODE_VECTOR_INT:
446 case MODE_INT:
447 if (mode != V32QImode)
448 {
449 if (!MEM_P (op0))
450 {
451 orig_op0 = op0;
452 op0 = gen_reg_rtx (V32QImode);
453 }
454 else
455 op0 = gen_lowpart (V32QImode, op0);
456 op1 = gen_lowpart (V32QImode, op1);
457 mode = V32QImode;
458 }
459 break;
460 case MODE_VECTOR_FLOAT:
461 break;
462 default:
463 gcc_unreachable ();
464 }
465
466 switch (mode)
467 {
468 default:
469 gcc_unreachable ();
470 case E_V32QImode:
471 extract = gen_avx_vextractf128v32qi;
472 mode = V16QImode;
473 break;
474 case E_V8SFmode:
475 extract = gen_avx_vextractf128v8sf;
476 mode = V4SFmode;
477 break;
478 case E_V4DFmode:
479 extract = gen_avx_vextractf128v4df;
480 mode = V2DFmode;
481 break;
482 }
483
484 if (MEM_P (op1))
485 {
486 rtx r = gen_reg_rtx (mode);
487 m = adjust_address (op1, mode, 0);
488 emit_move_insn (r, m);
489 m = adjust_address (op1, mode, 16);
490 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
491 emit_move_insn (op0, r);
492 }
493 else if (MEM_P (op0))
494 {
495 m = adjust_address (op0, mode, 0);
496 emit_insn (extract (m, op1, const0_rtx));
497 m = adjust_address (op0, mode, 16);
498 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
499 }
500 else
501 gcc_unreachable ();
502
503 if (orig_op0)
504 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
505 }
506
507 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
508 straight to ix86_expand_vector_move. */
509 /* Code generation for scalar reg-reg moves of single and double precision data:
510 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
511 movaps reg, reg
512 else
513 movss reg, reg
514 if (x86_sse_partial_reg_dependency == true)
515 movapd reg, reg
516 else
517 movsd reg, reg
518
519 Code generation for scalar loads of double precision data:
520 if (x86_sse_split_regs == true)
521 movlpd mem, reg (gas syntax)
522 else
523 movsd mem, reg
524
525 Code generation for unaligned packed loads of single precision data
526 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
527 if (x86_sse_unaligned_move_optimal)
528 movups mem, reg
529
530 if (x86_sse_partial_reg_dependency == true)
531 {
532 xorps reg, reg
533 movlps mem, reg
534 movhps mem+8, reg
535 }
536 else
537 {
538 movlps mem, reg
539 movhps mem+8, reg
540 }
541
542 Code generation for unaligned packed loads of double precision data
543 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
544 if (x86_sse_unaligned_move_optimal)
545 movupd mem, reg
546
547 if (x86_sse_split_regs == true)
548 {
549 movlpd mem, reg
550 movhpd mem+8, reg
551 }
552 else
553 {
554 movsd mem, reg
555 movhpd mem+8, reg
556 }
557 */
558
559 void
560 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
561 {
562 rtx op0, op1, m;
563
564 op0 = operands[0];
565 op1 = operands[1];
566
567 /* Use unaligned load/store for AVX512 or when optimizing for size. */
568 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
569 {
570 emit_insn (gen_rtx_SET (op0, op1));
571 return;
572 }
573
574 if (TARGET_AVX)
575 {
576 if (GET_MODE_SIZE (mode) == 32)
577 ix86_avx256_split_vector_move_misalign (op0, op1);
578 else
579 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
580 emit_insn (gen_rtx_SET (op0, op1));
581 return;
582 }
583
584 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
585 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
586 {
587 emit_insn (gen_rtx_SET (op0, op1));
588 return;
589 }
590
591 /* ??? If we have typed data, then it would appear that using
592 movdqu is the only way to get unaligned data loaded with
593 integer type. */
594 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
595 {
596 emit_insn (gen_rtx_SET (op0, op1));
597 return;
598 }
599
600 if (MEM_P (op1))
601 {
602 if (TARGET_SSE2 && mode == V2DFmode)
603 {
604 rtx zero;
605
606 /* When SSE registers are split into halves, we can avoid
607 writing to the top half twice. */
608 if (TARGET_SSE_SPLIT_REGS)
609 {
610 emit_clobber (op0);
611 zero = op0;
612 }
613 else
614 {
615 /* ??? Not sure about the best option for the Intel chips.
616 The following would seem to satisfy; the register is
617 entirely cleared, breaking the dependency chain. We
618 then store to the upper half, with a dependency depth
619 of one. A rumor has it that Intel recommends two movsd
620 followed by an unpacklpd, but this is unconfirmed. And
621 given that the dependency depth of the unpacklpd would
622 still be one, I'm not sure why this would be better. */
623 zero = CONST0_RTX (V2DFmode);
624 }
625
626 m = adjust_address (op1, DFmode, 0);
627 emit_insn (gen_sse2_loadlpd (op0, zero, m));
628 m = adjust_address (op1, DFmode, 8);
629 emit_insn (gen_sse2_loadhpd (op0, op0, m));
630 }
631 else
632 {
633 rtx t;
634
635 if (mode != V4SFmode)
636 t = gen_reg_rtx (V4SFmode);
637 else
638 t = op0;
639
640 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
641 emit_move_insn (t, CONST0_RTX (V4SFmode));
642 else
643 emit_clobber (t);
644
645 m = adjust_address (op1, V2SFmode, 0);
646 emit_insn (gen_sse_loadlps (t, t, m));
647 m = adjust_address (op1, V2SFmode, 8);
648 emit_insn (gen_sse_loadhps (t, t, m));
649 if (mode != V4SFmode)
650 emit_move_insn (op0, gen_lowpart (mode, t));
651 }
652 }
653 else if (MEM_P (op0))
654 {
655 if (TARGET_SSE2 && mode == V2DFmode)
656 {
657 m = adjust_address (op0, DFmode, 0);
658 emit_insn (gen_sse2_storelpd (m, op1));
659 m = adjust_address (op0, DFmode, 8);
660 emit_insn (gen_sse2_storehpd (m, op1));
661 }
662 else
663 {
664 if (mode != V4SFmode)
665 op1 = gen_lowpart (V4SFmode, op1);
666
667 m = adjust_address (op0, V2SFmode, 0);
668 emit_insn (gen_sse_storelps (m, op1));
669 m = adjust_address (op0, V2SFmode, 8);
670 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
671 }
672 }
673 else
674 gcc_unreachable ();
675 }
676
677 /* Move bits 64:95 to bits 32:63. */
678
679 void
680 ix86_move_vector_high_sse_to_mmx (rtx op)
681 {
682 rtx mask = gen_rtx_PARALLEL (VOIDmode,
683 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
684 GEN_INT (0), GEN_INT (0)));
685 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
686 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
687 rtx insn = gen_rtx_SET (dest, op);
688 emit_insn (insn);
689 }
690
691 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
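/* The 64-bit MMX operands are treated as the low halves of SSE registers:
   the pack is performed in the corresponding full SSE mode, which leaves
   the useful result bytes in bits 0:31 and 64:95 of the destination, and
   ix86_move_vector_high_sse_to_mmx then moves bits 64:95 down to 32:63 so
   that the low 64 bits hold the packed MMX result.  */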
692
693 void
694 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
695 {
696 rtx op0 = operands[0];
697 rtx op1 = operands[1];
698 rtx op2 = operands[2];
699
700 machine_mode dmode = GET_MODE (op0);
701 machine_mode smode = GET_MODE (op1);
702 machine_mode inner_dmode = GET_MODE_INNER (dmode);
703 machine_mode inner_smode = GET_MODE_INNER (smode);
704
705 /* Get the corresponding SSE mode for destination. */
706 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
707 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
708 nunits).require ();
709 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
710 nunits / 2).require ();
711
712 /* Get the corresponding SSE mode for source. */
713 nunits = 16 / GET_MODE_SIZE (inner_smode);
714 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
715 nunits).require ();
716
717 /* Generate SSE pack with signed/unsigned saturation. */
718 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
719 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
720 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
721
722 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
723 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
724 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
725 op1, op2));
726 emit_insn (insn);
727
728 ix86_move_vector_high_sse_to_mmx (op0);
729 }
730
731 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
732
733 void
734 ix86_split_mmx_punpck (rtx operands[], bool high_p)
735 {
736 rtx op0 = operands[0];
737 rtx op1 = operands[1];
738 rtx op2 = operands[2];
739 machine_mode mode = GET_MODE (op0);
740 rtx mask;
741 /* The corresponding SSE mode. */
742 machine_mode sse_mode, double_sse_mode;
743
744 switch (mode)
745 {
746 case E_V8QImode:
747 sse_mode = V16QImode;
748 double_sse_mode = V32QImode;
749 mask = gen_rtx_PARALLEL (VOIDmode,
750 gen_rtvec (16,
751 GEN_INT (0), GEN_INT (16),
752 GEN_INT (1), GEN_INT (17),
753 GEN_INT (2), GEN_INT (18),
754 GEN_INT (3), GEN_INT (19),
755 GEN_INT (4), GEN_INT (20),
756 GEN_INT (5), GEN_INT (21),
757 GEN_INT (6), GEN_INT (22),
758 GEN_INT (7), GEN_INT (23)));
759 break;
760
761 case E_V4HImode:
762 sse_mode = V8HImode;
763 double_sse_mode = V16HImode;
764 mask = gen_rtx_PARALLEL (VOIDmode,
765 gen_rtvec (8,
766 GEN_INT (0), GEN_INT (8),
767 GEN_INT (1), GEN_INT (9),
768 GEN_INT (2), GEN_INT (10),
769 GEN_INT (3), GEN_INT (11)));
770 break;
771
772 case E_V2SImode:
773 sse_mode = V4SImode;
774 double_sse_mode = V8SImode;
775 mask = gen_rtx_PARALLEL (VOIDmode,
776 gen_rtvec (4,
777 GEN_INT (0), GEN_INT (4),
778 GEN_INT (1), GEN_INT (5)));
779 break;
780
781 default:
782 gcc_unreachable ();
783 }
784
785 /* Generate SSE punpcklXX. */
786 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
787 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
788 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
789
790 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
791 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
792 rtx insn = gen_rtx_SET (dest, op2);
793 emit_insn (insn);
794
795 if (high_p)
796 {
797 /* Move bits 64:127 to bits 0:63. */
798 mask = gen_rtx_PARALLEL (VOIDmode,
799 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
800 GEN_INT (0), GEN_INT (0)));
801 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
802 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
803 insn = gen_rtx_SET (dest, op1);
804 emit_insn (insn);
805 }
806 }
807
808 /* Helper function of ix86_fixup_binary_operands to canonicalize
809 operand order. Returns true if the operands should be swapped. */
810
811 static bool
812 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
813 rtx operands[])
814 {
815 rtx dst = operands[0];
816 rtx src1 = operands[1];
817 rtx src2 = operands[2];
818
819 /* If the operation is not commutative, we can't do anything. */
820 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
821 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
822 return false;
823
824 /* Highest priority is that src1 should match dst. */
825 if (rtx_equal_p (dst, src1))
826 return false;
827 if (rtx_equal_p (dst, src2))
828 return true;
829
830 /* Next highest priority is that immediate constants come second. */
831 if (immediate_operand (src2, mode))
832 return false;
833 if (immediate_operand (src1, mode))
834 return true;
835
836 /* Lowest priority is that memory references should come second. */
837 if (MEM_P (src2))
838 return false;
839 if (MEM_P (src1))
840 return true;
841
842 return false;
843 }
844
845
846 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
847 destination to use for the operation. If different from the true
848 destination in operands[0], a copy operation will be required. */
849
850 rtx
851 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
852 rtx operands[])
853 {
854 rtx dst = operands[0];
855 rtx src1 = operands[1];
856 rtx src2 = operands[2];
857
858 /* Canonicalize operand order. */
859 if (ix86_swap_binary_operands_p (code, mode, operands))
860 {
861 /* It is invalid to swap operands of different modes. */
862 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
863
864 std::swap (src1, src2);
865 }
866
867 /* The source operands cannot both be in memory. */
868 if (MEM_P (src1) && MEM_P (src2))
869 {
870 /* Optimization: Only read from memory once. */
871 if (rtx_equal_p (src1, src2))
872 {
873 src2 = force_reg (mode, src2);
874 src1 = src2;
875 }
876 else if (rtx_equal_p (dst, src1))
877 src2 = force_reg (mode, src2);
878 else
879 src1 = force_reg (mode, src1);
880 }
881
882 /* If the destination is memory, and we do not have matching source
883 operands, do things in registers. */
884 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
885 dst = gen_reg_rtx (mode);
886
887 /* Source 1 cannot be a constant. */
888 if (CONSTANT_P (src1))
889 src1 = force_reg (mode, src1);
890
891 /* Source 1 cannot be a non-matching memory. */
892 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
893 src1 = force_reg (mode, src1);
894
895 /* Improve address combine: force a memory source 2 of an integer PLUS into a register so the addition can later be combined into an address. */
896 if (code == PLUS
897 && GET_MODE_CLASS (mode) == MODE_INT
898 && MEM_P (src2))
899 src2 = force_reg (mode, src2);
900
901 operands[1] = src1;
902 operands[2] = src2;
903 return dst;
904 }
905
906 /* Similarly, but assume that the destination has already been
907 set up properly. */
908
909 void
910 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
911 machine_mode mode, rtx operands[])
912 {
913 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
914 gcc_assert (dst == operands[0]);
915 }
916
917 /* Attempt to expand a binary operator. Make the expansion closer to the
918 actual machine than just general_operand, which would allow 3 separate
919 memory references (one output, two inputs) in a single insn. */
920
921 void
922 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
923 rtx operands[])
924 {
925 rtx src1, src2, dst, op, clob;
926
927 dst = ix86_fixup_binary_operands (code, mode, operands);
928 src1 = operands[1];
929 src2 = operands[2];
930
931 /* Emit the instruction. */
932
933 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
934
935 if (reload_completed
936 && code == PLUS
937 && !rtx_equal_p (dst, src1))
938 {
939 /* This is going to be an LEA; avoid splitting it later. */
940 emit_insn (op);
941 }
942 else
943 {
944 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
945 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
946 }
947
948 /* Fix up the destination if needed. */
949 if (dst != operands[0])
950 emit_move_insn (operands[0], dst);
951 }
952
953 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
954 the given OPERANDS. */
955
956 void
957 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
958 rtx operands[])
959 {
960 rtx op1 = NULL_RTX, op2 = NULL_RTX;
961 if (SUBREG_P (operands[1]))
962 {
963 op1 = operands[1];
964 op2 = operands[2];
965 }
966 else if (SUBREG_P (operands[2]))
967 {
968 op1 = operands[2];
969 op2 = operands[1];
970 }
971 /* Optimize (__m128i) d | (__m128i) e and similar code, when d and e
972 are float vectors, into a float vector logical insn. In C/C++,
973 without using intrinsics, there is no other way to express a vector
974 logical operation on float vectors than to cast them temporarily to
975 integer vectors. */
976 if (op1
977 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
978 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
979 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
980 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
981 && SUBREG_BYTE (op1) == 0
982 && (GET_CODE (op2) == CONST_VECTOR
983 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
984 && SUBREG_BYTE (op2) == 0))
985 && can_create_pseudo_p ())
986 {
987 rtx dst;
988 switch (GET_MODE (SUBREG_REG (op1)))
989 {
990 case E_V4SFmode:
991 case E_V8SFmode:
992 case E_V16SFmode:
993 case E_V2DFmode:
994 case E_V4DFmode:
995 case E_V8DFmode:
996 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
997 if (GET_CODE (op2) == CONST_VECTOR)
998 {
999 op2 = gen_lowpart (GET_MODE (dst), op2);
1000 op2 = force_reg (GET_MODE (dst), op2);
1001 }
1002 else
1003 {
1004 op1 = operands[1];
1005 op2 = SUBREG_REG (operands[2]);
1006 if (!vector_operand (op2, GET_MODE (dst)))
1007 op2 = force_reg (GET_MODE (dst), op2);
1008 }
1009 op1 = SUBREG_REG (op1);
1010 if (!vector_operand (op1, GET_MODE (dst)))
1011 op1 = force_reg (GET_MODE (dst), op1);
1012 emit_insn (gen_rtx_SET (dst,
1013 gen_rtx_fmt_ee (code, GET_MODE (dst),
1014 op1, op2)));
1015 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1016 return;
1017 default:
1018 break;
1019 }
1020 }
1021 if (!vector_operand (operands[1], mode))
1022 operands[1] = force_reg (mode, operands[1]);
1023 if (!vector_operand (operands[2], mode))
1024 operands[2] = force_reg (mode, operands[2]);
1025 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1026 emit_insn (gen_rtx_SET (operands[0],
1027 gen_rtx_fmt_ee (code, mode, operands[1],
1028 operands[2])));
1029 }
1030
1031 /* Return TRUE or FALSE depending on whether the binary operator meets the
1032 appropriate constraints. */
1033
1034 bool
1035 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1036 rtx operands[3])
1037 {
1038 rtx dst = operands[0];
1039 rtx src1 = operands[1];
1040 rtx src2 = operands[2];
1041
1042 /* The source operands cannot both be in memory. */
1043 if (MEM_P (src1) && MEM_P (src2))
1044 return false;
1045
1046 /* Canonicalize operand order for commutative operators. */
1047 if (ix86_swap_binary_operands_p (code, mode, operands))
1048 std::swap (src1, src2);
1049
1050 /* If the destination is memory, we must have a matching source operand. */
1051 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1052 return false;
1053
1054 /* Source 1 cannot be a constant. */
1055 if (CONSTANT_P (src1))
1056 return false;
1057
1058 /* Source 1 cannot be a non-matching memory. */
1059 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1060 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1061 return (code == AND
1062 && (mode == HImode
1063 || mode == SImode
1064 || (TARGET_64BIT && mode == DImode))
1065 && satisfies_constraint_L (src2));
1066
1067 return true;
1068 }
1069
1070 /* Attempt to expand a unary operator. Make the expansion closer to the
1071 actual machine than just general_operand, which would allow 2 separate
1072 memory references (one output, one input) in a single insn. */
1073
1074 void
1075 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1076 rtx operands[])
1077 {
1078 bool matching_memory = false;
1079 rtx src, dst, op, clob;
1080
1081 dst = operands[0];
1082 src = operands[1];
1083
1084 /* If the destination is memory, and we do not have matching source
1085 operands, do things in registers. */
1086 if (MEM_P (dst))
1087 {
1088 if (rtx_equal_p (dst, src))
1089 matching_memory = true;
1090 else
1091 dst = gen_reg_rtx (mode);
1092 }
1093
1094 /* When source operand is memory, destination must match. */
1095 if (MEM_P (src) && !matching_memory)
1096 src = force_reg (mode, src);
1097
1098 /* Emit the instruction. */
1099
1100 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1101
1102 if (code == NOT)
1103 emit_insn (op);
1104 else
1105 {
1106 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1107 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1108 }
1109
1110 /* Fix up the destination if needed. */
1111 if (dst != operands[0])
1112 emit_move_insn (operands[0], dst);
1113 }
1114
1115 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1116
1117 static void
1118 predict_jump (int prob)
1119 {
1120 rtx_insn *insn = get_last_insn ();
1121 gcc_assert (JUMP_P (insn));
1122 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1123 }
1124
1125 /* Split a 32-bit/64-bit divmod into an 8-bit unsigned divmod if the
1126 dividend and divisor are both within the range [0-255]. */
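/* Schematically, the expansion below is (an illustrative sketch, not the
   exact RTL that is emitted):

     scratch = dividend | divisor;
     if ((scratch & ~0xff) == 0)
       use the 8-bit unsigned divide, quotient in AL, remainder in AH;
     else
       use the full-width signed/unsigned divmod;

   with the branch predicted 50/50.  */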
1127
1128 void
1129 ix86_split_idivmod (machine_mode mode, rtx operands[],
1130 bool unsigned_p)
1131 {
1132 rtx_code_label *end_label, *qimode_label;
1133 rtx div, mod;
1134 rtx_insn *insn;
1135 rtx scratch, tmp0, tmp1, tmp2;
1136 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1137
1138 switch (mode)
1139 {
1140 case E_SImode:
1141 if (GET_MODE (operands[0]) == SImode)
1142 {
1143 if (GET_MODE (operands[1]) == SImode)
1144 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1145 else
1146 gen_divmod4_1
1147 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1148 }
1149 else
1150 gen_divmod4_1
1151 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1152 break;
1153
1154 case E_DImode:
1155 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1156 break;
1157
1158 default:
1159 gcc_unreachable ();
1160 }
1161
1162 end_label = gen_label_rtx ();
1163 qimode_label = gen_label_rtx ();
1164
1165 scratch = gen_reg_rtx (mode);
1166
1167 /* Use 8-bit unsigned divmod if the dividend and divisor are within
1168 the range [0-255]. */
1169 emit_move_insn (scratch, operands[2]);
1170 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1171 scratch, 1, OPTAB_DIRECT);
1172 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1173 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1174 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1175 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1176 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1177 pc_rtx);
1178 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1179 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1180 JUMP_LABEL (insn) = qimode_label;
1181
1182 /* Generate the original signed/unsigned divmod. */
1183 div = gen_divmod4_1 (operands[0], operands[1],
1184 operands[2], operands[3]);
1185 emit_insn (div);
1186
1187 /* Branch to the end. */
1188 emit_jump_insn (gen_jump (end_label));
1189 emit_barrier ();
1190
1191 /* Generate the 8-bit unsigned divide. */
1192 emit_label (qimode_label);
1193 /* Don't use operands[0] for the result of the 8-bit divide since not
1194 all registers support QImode ZERO_EXTRACT. */
1195 tmp0 = lowpart_subreg (HImode, scratch, mode);
1196 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1197 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1198 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1199
1200 if (unsigned_p)
1201 {
1202 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1203 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1204 }
1205 else
1206 {
1207 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1208 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1209 }
1210 if (mode == SImode)
1211 {
1212 if (GET_MODE (operands[0]) != SImode)
1213 div = gen_rtx_ZERO_EXTEND (DImode, div);
1214 if (GET_MODE (operands[1]) != SImode)
1215 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1216 }
1217
1218 /* Extract remainder from AH. */
1219 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
1220 tmp0, GEN_INT (8), GEN_INT (8));
1221 if (REG_P (operands[1]))
1222 insn = emit_move_insn (operands[1], tmp1);
1223 else
1224 {
1225 /* Need a new scratch register since the old one holds the result
1226 of the 8-bit divide. */
1227 scratch = gen_reg_rtx (GET_MODE (operands[1]));
1228 emit_move_insn (scratch, tmp1);
1229 insn = emit_move_insn (operands[1], scratch);
1230 }
1231 set_unique_reg_note (insn, REG_EQUAL, mod);
1232
1233 /* Zero extend quotient from AL. */
1234 tmp1 = gen_lowpart (QImode, tmp0);
1235 insn = emit_insn (gen_extend_insn
1236 (operands[0], tmp1,
1237 GET_MODE (operands[0]), QImode, 1));
1238 set_unique_reg_note (insn, REG_EQUAL, div);
1239
1240 emit_label (end_label);
1241 }
1242
1243 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
1244 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1245
1246 void
1247 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1248 rtx dst, rtx src)
1249 {
1250 rtx op, clob;
1251
1252 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1253 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1254
1255 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1256 }
1257
1258 /* Return true if the def of REGNO1 is nearer to INSN than that of REGNO2. */
1259
1260 static bool
1261 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1262 {
1263 rtx_insn *prev = insn;
1264 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1265
1266 if (insn == start)
1267 return false;
1268 while (prev && prev != start)
1269 {
1270 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1271 {
1272 prev = PREV_INSN (prev);
1273 continue;
1274 }
1275 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1276 return true;
1277 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1278 return false;
1279 prev = PREV_INSN (prev);
1280 }
1281
1282 /* Neither of the regs is defined in the bb. */
1283 return false;
1284 }
1285
1286 /* Split an lea instruction into a sequence of instructions that
1287 execute on the ALU, to avoid AGU stalls.
1288 It is assumed that the flags register may be clobbered at the
1289 lea's position. */
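/* For example (an illustrative sketch; the exact sequence depends on the
   register assignment and on which address parts are present):

     lea 0x4(%rbx,%rcx,4), %rax

   may be split into

     mov %rcx, %rax
     shl $2, %rax
     add %rbx, %rax
     add $0x4, %rax  */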
1290
1291 void
1292 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1293 {
1294 unsigned int regno0, regno1, regno2;
1295 struct ix86_address parts;
1296 rtx target, tmp;
1297 int ok, adds;
1298
1299 ok = ix86_decompose_address (operands[1], &parts);
1300 gcc_assert (ok);
1301
1302 target = gen_lowpart (mode, operands[0]);
1303
1304 regno0 = true_regnum (target);
1305 regno1 = INVALID_REGNUM;
1306 regno2 = INVALID_REGNUM;
1307
1308 if (parts.base)
1309 {
1310 parts.base = gen_lowpart (mode, parts.base);
1311 regno1 = true_regnum (parts.base);
1312 }
1313
1314 if (parts.index)
1315 {
1316 parts.index = gen_lowpart (mode, parts.index);
1317 regno2 = true_regnum (parts.index);
1318 }
1319
1320 if (parts.disp)
1321 parts.disp = gen_lowpart (mode, parts.disp);
1322
1323 if (parts.scale > 1)
1324 {
1325 /* Case r1 = r1 + ... */
1326 if (regno1 == regno0)
1327 {
1328 /* If we have the case r1 = r1 + C * r2 then we
1329 would have to use multiplication, which is very
1330 expensive. Assume the cost model is wrong if we
1331 get such a case here. */
1332 gcc_assert (regno2 != regno0);
1333
1334 for (adds = parts.scale; adds > 0; adds--)
1335 ix86_emit_binop (PLUS, mode, target, parts.index);
1336 }
1337 else
1338 {
1339 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1340 if (regno0 != regno2)
1341 emit_insn (gen_rtx_SET (target, parts.index));
1342
1343 /* Use shift for scaling. */
1344 ix86_emit_binop (ASHIFT, mode, target,
1345 GEN_INT (exact_log2 (parts.scale)));
1346
1347 if (parts.base)
1348 ix86_emit_binop (PLUS, mode, target, parts.base);
1349
1350 if (parts.disp && parts.disp != const0_rtx)
1351 ix86_emit_binop (PLUS, mode, target, parts.disp);
1352 }
1353 }
1354 else if (!parts.base && !parts.index)
1355 {
1356 gcc_assert (parts.disp);
1357 emit_insn (gen_rtx_SET (target, parts.disp));
1358 }
1359 else
1360 {
1361 if (!parts.base)
1362 {
1363 if (regno0 != regno2)
1364 emit_insn (gen_rtx_SET (target, parts.index));
1365 }
1366 else if (!parts.index)
1367 {
1368 if (regno0 != regno1)
1369 emit_insn (gen_rtx_SET (target, parts.base));
1370 }
1371 else
1372 {
1373 if (regno0 == regno1)
1374 tmp = parts.index;
1375 else if (regno0 == regno2)
1376 tmp = parts.base;
1377 else
1378 {
1379 rtx tmp1;
1380
1381 /* Find better operand for SET instruction, depending
1382 on which definition is farther from the insn. */
1383 if (find_nearest_reg_def (insn, regno1, regno2))
1384 tmp = parts.index, tmp1 = parts.base;
1385 else
1386 tmp = parts.base, tmp1 = parts.index;
1387
1388 emit_insn (gen_rtx_SET (target, tmp));
1389
1390 if (parts.disp && parts.disp != const0_rtx)
1391 ix86_emit_binop (PLUS, mode, target, parts.disp);
1392
1393 ix86_emit_binop (PLUS, mode, target, tmp1);
1394 return;
1395 }
1396
1397 ix86_emit_binop (PLUS, mode, target, tmp);
1398 }
1399
1400 if (parts.disp && parts.disp != const0_rtx)
1401 ix86_emit_binop (PLUS, mode, target, parts.disp);
1402 }
1403 }
1404
1405 /* Post-reload splitter for converting an SF or DFmode value in an
1406 SSE register into an unsigned SImode. */
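/* The idea (sketch): where VALUE >= 0x1p31, subtract 0x1p31 before the
   signed conversion and xor 0x80000000 into the integer result afterwards;
   the LE comparison below builds the per-element mask that selects between
   0 and 0x1p31 (and, shifted left by 31, between 0 and 0x80000000).  */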
1407
1408 void
1409 ix86_split_convert_uns_si_sse (rtx operands[])
1410 {
1411 machine_mode vecmode;
1412 rtx value, large, zero_or_two31, input, two31, x;
1413
1414 large = operands[1];
1415 zero_or_two31 = operands[2];
1416 input = operands[3];
1417 two31 = operands[4];
1418 vecmode = GET_MODE (large);
1419 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1420
1421 /* Load up the value into the low element. We must ensure that the other
1422 elements are valid floats -- zero is the easiest such value. */
1423 if (MEM_P (input))
1424 {
1425 if (vecmode == V4SFmode)
1426 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1427 else
1428 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1429 }
1430 else
1431 {
1432 input = gen_rtx_REG (vecmode, REGNO (input));
1433 emit_move_insn (value, CONST0_RTX (vecmode));
1434 if (vecmode == V4SFmode)
1435 emit_insn (gen_sse_movss (value, value, input));
1436 else
1437 emit_insn (gen_sse2_movsd (value, value, input));
1438 }
1439
1440 emit_move_insn (large, two31);
1441 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1442
1443 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1444 emit_insn (gen_rtx_SET (large, x));
1445
1446 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1447 emit_insn (gen_rtx_SET (zero_or_two31, x));
1448
1449 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1450 emit_insn (gen_rtx_SET (value, x));
1451
1452 large = gen_rtx_REG (V4SImode, REGNO (large));
1453 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1454
1455 x = gen_rtx_REG (V4SImode, REGNO (value));
1456 if (vecmode == V4SFmode)
1457 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1458 else
1459 emit_insn (gen_sse2_cvttpd2dq (x, value));
1460 value = x;
1461
1462 emit_insn (gen_xorv4si3 (value, value, large));
1463 }
1464
1465 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1466 machine_mode mode, rtx target,
1467 rtx var, int one_var);
1468
1469 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1470 Expects the 64-bit DImode to be supplied in a pair of integral
1471 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1472 -mfpmath=sse, !optimize_size only. */
1473
1474 void
1475 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1476 {
1477 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1478 rtx int_xmm, fp_xmm;
1479 rtx biases, exponents;
1480 rtx x;
1481
1482 int_xmm = gen_reg_rtx (V4SImode);
1483 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1484 emit_insn (gen_movdi_to_sse (int_xmm, input));
1485 else if (TARGET_SSE_SPLIT_REGS)
1486 {
1487 emit_clobber (int_xmm);
1488 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1489 }
1490 else
1491 {
1492 x = gen_reg_rtx (V2DImode);
1493 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1494 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1495 }
1496
1497 x = gen_rtx_CONST_VECTOR (V4SImode,
1498 gen_rtvec (4, GEN_INT (0x43300000UL),
1499 GEN_INT (0x45300000UL),
1500 const0_rtx, const0_rtx));
1501 exponents = validize_mem (force_const_mem (V4SImode, x));
1502
1503 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1504 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1505
1506 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1507 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1508 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1509 (0x1.0p84 + double(fp_value_hi_xmm)).
1510 Note these exponents differ by 32. */
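/* Equivalently: after subtracting the 0x1.0p52 and 0x1.0p84 biases below,
   the two doubles are exactly (double) lo and (double) hi * 0x1.0p32, and
   their sum, rounded once by the final addition, is the DFmode conversion
   of the unsigned 64-bit input (a restatement of the trick above).  */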
1511
1512 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1513
1514 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1515 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1516 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1517 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1518 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1519 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1520 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1521 biases = validize_mem (force_const_mem (V2DFmode, biases));
1522 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1523
1524 /* Add the upper and lower DFmode values together. */
1525 if (TARGET_SSE3)
1526 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1527 else
1528 {
1529 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1530 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1531 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1532 }
1533
1534 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1535 }
1536
1537 /* Not used, but eases macroization of patterns. */
1538 void
1539 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1540 {
1541 gcc_unreachable ();
1542 }
1543
1544 /* Convert an unsigned SImode value into a DFmode. Only currently used
1545 for SSE, but applicable anywhere. */
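/* The trick (sketch): flip the sign bit so the value can be converted with
   the signed floatsidf instruction, then add 0x1p31 back in DFmode, which
   is exact because DFmode has more than 32 bits of mantissa.  */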
1546
1547 void
1548 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1549 {
1550 REAL_VALUE_TYPE TWO31r;
1551 rtx x, fp;
1552
1553 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1554 NULL, 1, OPTAB_DIRECT);
1555
1556 fp = gen_reg_rtx (DFmode);
1557 emit_insn (gen_floatsidf2 (fp, x));
1558
1559 real_ldexp (&TWO31r, &dconst1, 31);
1560 x = const_double_from_real_value (TWO31r, DFmode);
1561
1562 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1563 if (x != target)
1564 emit_move_insn (target, x);
1565 }
1566
1567 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1568 32-bit mode; otherwise we have a direct convert instruction. */
1569
1570 void
1571 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1572 {
1573 REAL_VALUE_TYPE TWO32r;
1574 rtx fp_lo, fp_hi, x;
1575
1576 fp_lo = gen_reg_rtx (DFmode);
1577 fp_hi = gen_reg_rtx (DFmode);
1578
1579 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1580
1581 real_ldexp (&TWO32r, &dconst1, 32);
1582 x = const_double_from_real_value (TWO32r, DFmode);
1583 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1584
1585 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1586
1587 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1588 0, OPTAB_DIRECT);
1589 if (x != target)
1590 emit_move_insn (target, x);
1591 }
1592
1593 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1594 For x86_32, -mfpmath=sse, !optimize_size only. */
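/* In effect: target = (float) (input >> 16) * 0x1p16 + (float) (input & 0xffff).
   The partial conversions and the multiply by a power of two are exact, so
   only the final addition rounds.  */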
1595 void
1596 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1597 {
1598 REAL_VALUE_TYPE ONE16r;
1599 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1600
1601 real_ldexp (&ONE16r, &dconst1, 16);
1602 x = const_double_from_real_value (ONE16r, SFmode);
1603 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1604 NULL, 0, OPTAB_DIRECT);
1605 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1606 NULL, 0, OPTAB_DIRECT);
1607 fp_hi = gen_reg_rtx (SFmode);
1608 fp_lo = gen_reg_rtx (SFmode);
1609 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1610 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1611 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1612 0, OPTAB_DIRECT);
1613 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1614 0, OPTAB_DIRECT);
1615 if (!rtx_equal_p (target, fp_hi))
1616 emit_move_insn (target, fp_hi);
1617 }
1618
1619 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1620 a vector of unsigned ints VAL to vector of floats TARGET. */
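/* This uses, elementwise, the same 16-bit split as
   ix86_expand_convert_uns_sisf_sse above:
   target = (float) (val >> 16) * 0x1p16 + (float) (val & 0xffff).  */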
1621
1622 void
1623 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1624 {
1625 rtx tmp[8];
1626 REAL_VALUE_TYPE TWO16r;
1627 machine_mode intmode = GET_MODE (val);
1628 machine_mode fltmode = GET_MODE (target);
1629 rtx (*cvt) (rtx, rtx);
1630
1631 if (intmode == V4SImode)
1632 cvt = gen_floatv4siv4sf2;
1633 else
1634 cvt = gen_floatv8siv8sf2;
1635 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1636 tmp[0] = force_reg (intmode, tmp[0]);
1637 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1638 OPTAB_DIRECT);
1639 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1640 NULL_RTX, 1, OPTAB_DIRECT);
1641 tmp[3] = gen_reg_rtx (fltmode);
1642 emit_insn (cvt (tmp[3], tmp[1]));
1643 tmp[4] = gen_reg_rtx (fltmode);
1644 emit_insn (cvt (tmp[4], tmp[2]));
1645 real_ldexp (&TWO16r, &dconst1, 16);
1646 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1647 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1648 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1649 OPTAB_DIRECT);
1650 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1651 OPTAB_DIRECT);
1652 if (tmp[7] != target)
1653 emit_move_insn (target, tmp[7]);
1654 }
1655
1656 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1657 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1658 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1659 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1660
1661 rtx
1662 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1663 {
1664 REAL_VALUE_TYPE TWO31r;
1665 rtx two31r, tmp[4];
1666 machine_mode mode = GET_MODE (val);
1667 machine_mode scalarmode = GET_MODE_INNER (mode);
1668 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1669 rtx (*cmp) (rtx, rtx, rtx, rtx);
1670 int i;
1671
1672 for (i = 0; i < 3; i++)
1673 tmp[i] = gen_reg_rtx (mode);
1674 real_ldexp (&TWO31r, &dconst1, 31);
1675 two31r = const_double_from_real_value (TWO31r, scalarmode);
1676 two31r = ix86_build_const_vector (mode, 1, two31r);
1677 two31r = force_reg (mode, two31r);
1678 switch (mode)
1679 {
1680 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1681 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1682 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1683 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1684 default: gcc_unreachable ();
1685 }
1686 tmp[3] = gen_rtx_LE (mode, two31r, val);
1687 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1688 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1689 0, OPTAB_DIRECT);
1690 if (intmode == V4SImode || TARGET_AVX2)
1691 *xorp = expand_simple_binop (intmode, ASHIFT,
1692 gen_lowpart (intmode, tmp[0]),
1693 GEN_INT (31), NULL_RTX, 0,
1694 OPTAB_DIRECT);
1695 else
1696 {
1697 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
1698 two31 = ix86_build_const_vector (intmode, 1, two31);
1699 *xorp = expand_simple_binop (intmode, AND,
1700 gen_lowpart (intmode, tmp[0]),
1701 two31, NULL_RTX, 0,
1702 OPTAB_DIRECT);
1703 }
1704 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1705 0, OPTAB_DIRECT);
1706 }
1707
1708 /* Generate code for floating point ABS or NEG. */
1709
1710 void
1711 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1712 rtx operands[])
1713 {
1714 rtx set, dst, src;
1715 bool use_sse = false;
1716 bool vector_mode = VECTOR_MODE_P (mode);
1717 machine_mode vmode = mode;
1718 rtvec par;
1719
1720 if (vector_mode)
1721 use_sse = true;
1722 else if (mode == TFmode)
1723 use_sse = true;
1724 else if (TARGET_SSE_MATH)
1725 {
1726 use_sse = SSE_FLOAT_MODE_P (mode);
1727 if (mode == SFmode)
1728 vmode = V4SFmode;
1729 else if (mode == DFmode)
1730 vmode = V2DFmode;
1731 }
1732
1733 dst = operands[0];
1734 src = operands[1];
1735
1736 set = gen_rtx_fmt_e (code, mode, src);
1737 set = gen_rtx_SET (dst, set);
1738
1739 if (use_sse)
1740 {
1741 rtx mask, use, clob;
1742
1743 /* NEG and ABS performed with SSE use bitwise mask operations.
1744 Create the appropriate mask now. */
1745 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1746 use = gen_rtx_USE (VOIDmode, mask);
1747 if (vector_mode)
1748 par = gen_rtvec (2, set, use);
1749 else
1750 {
1751 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1752 par = gen_rtvec (3, set, use, clob);
1753 }
1754 }
1755 else
1756 {
1757 rtx clob;
1758
1759 /* Changing the sign of FP values can be done using the integer unit too. */
1760 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1761 par = gen_rtvec (2, set, clob);
1762 }
1763
1764 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1765 }
1766
1767 /* Deconstruct a floating point ABS or NEG operation
1768 with integer registers into integer operations. */
1769
1770 void
1771 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1772 rtx operands[])
1773 {
1774 enum rtx_code absneg_op;
1775 rtx dst, set;
1776
1777 gcc_assert (operands_match_p (operands[0], operands[1]));
1778
1779 switch (mode)
1780 {
1781 case E_SFmode:
1782 dst = gen_lowpart (SImode, operands[0]);
1783
1784 if (code == ABS)
1785 {
1786 set = gen_int_mode (0x7fffffff, SImode);
1787 absneg_op = AND;
1788 }
1789 else
1790 {
1791 set = gen_int_mode (0x80000000, SImode);
1792 absneg_op = XOR;
1793 }
1794 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1795 break;
1796
1797 case E_DFmode:
1798 if (TARGET_64BIT)
1799 {
1800 dst = gen_lowpart (DImode, operands[0]);
1801 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1802
1803 if (code == ABS)
1804 set = const0_rtx;
1805 else
1806 set = gen_rtx_NOT (DImode, dst);
1807 }
1808 else
1809 {
1810 dst = gen_highpart (SImode, operands[0]);
1811
1812 if (code == ABS)
1813 {
1814 set = gen_int_mode (0x7fffffff, SImode);
1815 absneg_op = AND;
1816 }
1817 else
1818 {
1819 set = gen_int_mode (0x80000000, SImode);
1820 absneg_op = XOR;
1821 }
1822 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1823 }
1824 break;
1825
1826 case E_XFmode:
1827 dst = gen_rtx_REG (SImode,
1828 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1829 if (code == ABS)
1830 {
1831 set = GEN_INT (0x7fff);
1832 absneg_op = AND;
1833 }
1834 else
1835 {
1836 set = GEN_INT (0x8000);
1837 absneg_op = XOR;
1838 }
1839 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1840 break;
1841
1842 default:
1843 gcc_unreachable ();
1844 }
1845
1846 set = gen_rtx_SET (dst, set);
1847
1848 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1849 rtvec par = gen_rtvec (2, set, clob);
1850
1851 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1852 }
1853
1854 /* Expand a copysign operation. Special case operand 0 being a constant. */
1855
1856 void
1857 ix86_expand_copysign (rtx operands[])
1858 {
1859 machine_mode mode, vmode;
1860 rtx dest, op0, op1, mask;
1861
1862 dest = operands[0];
1863 op0 = operands[1];
1864 op1 = operands[2];
1865
1866 mode = GET_MODE (dest);
1867
1868 if (mode == SFmode)
1869 vmode = V4SFmode;
1870 else if (mode == DFmode)
1871 vmode = V2DFmode;
1872 else if (mode == TFmode)
1873 vmode = mode;
1874 else
1875 gcc_unreachable ();
1876
1877 mask = ix86_build_signbit_mask (vmode, 0, 0);
1878
1879 if (CONST_DOUBLE_P (op0))
1880 {
1881 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1882 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1883
1884 if (mode == SFmode || mode == DFmode)
1885 {
1886 if (op0 == CONST0_RTX (mode))
1887 op0 = CONST0_RTX (vmode);
1888 else
1889 {
1890 rtx v = ix86_build_const_vector (vmode, false, op0);
1891
1892 op0 = force_reg (vmode, v);
1893 }
1894 }
1895 else if (op0 != CONST0_RTX (mode))
1896 op0 = force_reg (mode, op0);
1897
1898 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1899 }
1900 else
1901 {
1902 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1903
1904 emit_insn (gen_copysign3_var
1905 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1906 }
1907 }
1908
1909 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1910 be a constant, and so has already been expanded into a vector constant. */
1911
1912 void
1913 ix86_split_copysign_const (rtx operands[])
1914 {
1915 machine_mode mode, vmode;
1916 rtx dest, op0, mask, x;
1917
1918 dest = operands[0];
1919 op0 = operands[1];
1920 mask = operands[3];
1921
1922 mode = GET_MODE (dest);
1923 vmode = GET_MODE (mask);
1924
1925 dest = lowpart_subreg (vmode, dest, mode);
1926 x = gen_rtx_AND (vmode, dest, mask);
1927 emit_insn (gen_rtx_SET (dest, x));
1928
1929 if (op0 != CONST0_RTX (vmode))
1930 {
1931 x = gen_rtx_IOR (vmode, dest, op0);
1932 emit_insn (gen_rtx_SET (dest, x));
1933 }
1934 }
1935
1936 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1937 so we have to do two masks. */
1938
1939 void
1940 ix86_split_copysign_var (rtx operands[])
1941 {
1942 machine_mode mode, vmode;
1943 rtx dest, scratch, op0, op1, mask, nmask, x;
1944
1945 dest = operands[0];
1946 scratch = operands[1];
1947 op0 = operands[2];
1948 op1 = operands[3];
1949 nmask = operands[4];
1950 mask = operands[5];
1951
1952 mode = GET_MODE (dest);
1953 vmode = GET_MODE (mask);
1954
1955 if (rtx_equal_p (op0, op1))
1956 {
1957 /* Shouldn't happen often (it's useless, obviously), but when it does
1958 we'd generate incorrect code if we continue below. */
1959 emit_move_insn (dest, op0);
1960 return;
1961 }
1962
1963 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1964 {
1965 gcc_assert (REGNO (op1) == REGNO (scratch));
1966
1967 x = gen_rtx_AND (vmode, scratch, mask);
1968 emit_insn (gen_rtx_SET (scratch, x));
1969
1970 dest = mask;
1971 op0 = lowpart_subreg (vmode, op0, mode);
1972 x = gen_rtx_NOT (vmode, dest);
1973 x = gen_rtx_AND (vmode, x, op0);
1974 emit_insn (gen_rtx_SET (dest, x));
1975 }
1976 else
1977 {
1978 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1979 {
1980 x = gen_rtx_AND (vmode, scratch, mask);
1981 }
1982 else /* alternative 2,4 */
1983 {
1984 gcc_assert (REGNO (mask) == REGNO (scratch));
1985 op1 = lowpart_subreg (vmode, op1, mode);
1986 x = gen_rtx_AND (vmode, scratch, op1);
1987 }
1988 emit_insn (gen_rtx_SET (scratch, x));
1989
1990 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1991 {
1992 dest = lowpart_subreg (vmode, op0, mode);
1993 x = gen_rtx_AND (vmode, dest, nmask);
1994 }
1995 else /* alternative 3,4 */
1996 {
1997 gcc_assert (REGNO (nmask) == REGNO (dest));
1998 dest = nmask;
1999 op0 = lowpart_subreg (vmode, op0, mode);
2000 x = gen_rtx_AND (vmode, dest, op0);
2001 }
2002 emit_insn (gen_rtx_SET (dest, x));
2003 }
2004
2005 x = gen_rtx_IOR (vmode, dest, scratch);
2006 emit_insn (gen_rtx_SET (dest, x));
2007 }
2008
2009 /* Expand an xorsign operation. */
2010
2011 void
2012 ix86_expand_xorsign (rtx operands[])
2013 {
2014 machine_mode mode, vmode;
2015 rtx dest, op0, op1, mask;
2016
2017 dest = operands[0];
2018 op0 = operands[1];
2019 op1 = operands[2];
2020
2021 mode = GET_MODE (dest);
2022
2023 if (mode == SFmode)
2024 vmode = V4SFmode;
2025 else if (mode == DFmode)
2026 vmode = V2DFmode;
2027 else
2028 gcc_unreachable ();
2029
2030 mask = ix86_build_signbit_mask (vmode, 0, 0);
2031
2032 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2033 }
2034
2035 /* Deconstruct an xorsign operation into bit masks. */
2036
2037 void
2038 ix86_split_xorsign (rtx operands[])
2039 {
2040 machine_mode mode, vmode;
2041 rtx dest, op0, mask, x;
2042
2043 dest = operands[0];
2044 op0 = operands[1];
2045 mask = operands[3];
2046
2047 mode = GET_MODE (dest);
2048 vmode = GET_MODE (mask);
2049
2050 dest = lowpart_subreg (vmode, dest, mode);
2051 x = gen_rtx_AND (vmode, dest, mask);
2052 emit_insn (gen_rtx_SET (dest, x));
2053
2054 op0 = lowpart_subreg (vmode, op0, mode);
2055 x = gen_rtx_XOR (vmode, dest, op0);
2056 emit_insn (gen_rtx_SET (dest, x));
2057 }
2058
2059 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2060
2061 void
2062 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2063 {
2064 machine_mode mode = GET_MODE (op0);
2065 rtx tmp;
2066
2067 /* Handle the special case of a vector comparison with a boolean result;
2068 transform it using the ptest instruction. */
2069 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2070 {
2071 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2072 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2073
2074 gcc_assert (code == EQ || code == NE);
2075 /* Generate XOR since we can't check that one operand is a zero vector. */
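/* ptest with both inputs equal to that XOR sets ZF exactly when the XOR is
   all zeros, i.e. when op0 == op1, which is all EQ/NE need.  */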
2076 tmp = gen_reg_rtx (mode);
2077 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2078 tmp = gen_lowpart (p_mode, tmp);
2079 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2080 gen_rtx_UNSPEC (CCmode,
2081 gen_rtvec (2, tmp, tmp),
2082 UNSPEC_PTEST)));
2083 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2084 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2085 gen_rtx_LABEL_REF (VOIDmode, label),
2086 pc_rtx);
2087 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2088 return;
2089 }
2090
2091 switch (mode)
2092 {
2093 case E_SFmode:
2094 case E_DFmode:
2095 case E_XFmode:
2096 case E_QImode:
2097 case E_HImode:
2098 case E_SImode:
2099 simple:
2100 tmp = ix86_expand_compare (code, op0, op1);
2101 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2102 gen_rtx_LABEL_REF (VOIDmode, label),
2103 pc_rtx);
2104 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2105 return;
2106
2107 case E_DImode:
2108 if (TARGET_64BIT)
2109 goto simple;
2110 /* On 32-bit targets a DImode comparison may be performed in
2111 SSE registers. To allow this we avoid splitting into
2112 SImode, which is achieved by doing the xor in DImode and
2113 then comparing the result with zero (a form the STV pass
2114 recognizes). Don't compare using xor when optimizing
2115 for size. */
2116 if (!optimize_insn_for_size_p ()
2117 && TARGET_STV
2118 && (code == EQ || code == NE))
2119 {
2120 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2121 op1 = const0_rtx;
2122 }
2123 /* FALLTHRU */
2124 case E_TImode:
2125 /* Expand a double-word branch into multiple compare+branch sequences. */
2126 {
2127 rtx lo[2], hi[2];
2128 rtx_code_label *label2;
2129 enum rtx_code code1, code2, code3;
2130 machine_mode submode;
2131
2132 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2133 {
2134 std::swap (op0, op1);
2135 code = swap_condition (code);
2136 }
2137
2138 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2139 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2140
2141 submode = mode == DImode ? SImode : DImode;
2142
2143 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2144 avoid two branches. This costs one extra insn, so disable when
2145 optimizing for size. */
2146
2147 if ((code == EQ || code == NE)
2148 && (!optimize_insn_for_size_p ()
2149 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2150 {
2151 rtx xor0, xor1;
2152
2153 xor1 = hi[0];
2154 if (hi[1] != const0_rtx)
2155 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2156 NULL_RTX, 0, OPTAB_WIDEN);
2157
2158 xor0 = lo[0];
2159 if (lo[1] != const0_rtx)
2160 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2161 NULL_RTX, 0, OPTAB_WIDEN);
2162
2163 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2164 NULL_RTX, 0, OPTAB_WIDEN);
2165
2166 ix86_expand_branch (code, tmp, const0_rtx, label);
2167 return;
2168 }
2169
2170 /* Otherwise, if we are doing a less-than or greater-or-equal
2171 comparison, op1 is a constant and the low word is zero, then we
2172 can just examine the high word. Similarly for a low word of -1
2173 and a less-or-equal or greater-than comparison. */
2174
2175 if (CONST_INT_P (hi[1]))
2176 switch (code)
2177 {
2178 case LT: case LTU: case GE: case GEU:
2179 if (lo[1] == const0_rtx)
2180 {
2181 ix86_expand_branch (code, hi[0], hi[1], label);
2182 return;
2183 }
2184 break;
2185 case LE: case LEU: case GT: case GTU:
2186 if (lo[1] == constm1_rtx)
2187 {
2188 ix86_expand_branch (code, hi[0], hi[1], label);
2189 return;
2190 }
2191 break;
2192 default:
2193 break;
2194 }
2195
2196 /* Emulate comparisons that do not depend on Zero flag with
2197 double-word subtraction. Note that only Overflow, Sign
2198 and Carry flags are valid, so swap arguments and condition
2199 of comparisons that would otherwise test Zero flag. */
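/* Concretely, we emit a compare of the low halves followed by an sbb of the
   high halves into a scratch, and branch on the resulting Carry/Sign/Overflow
   flags; the double-word difference itself is never stored.  */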
2200
2201 switch (code)
2202 {
2203 case LE: case LEU: case GT: case GTU:
2204 std::swap (lo[0], lo[1]);
2205 std::swap (hi[0], hi[1]);
2206 code = swap_condition (code);
2207 /* FALLTHRU */
2208
2209 case LT: case LTU: case GE: case GEU:
2210 {
2211 bool uns = (code == LTU || code == GEU);
2212 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2213 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2214
2215 if (!nonimmediate_operand (lo[0], submode))
2216 lo[0] = force_reg (submode, lo[0]);
2217 if (!x86_64_general_operand (lo[1], submode))
2218 lo[1] = force_reg (submode, lo[1]);
2219
2220 if (!register_operand (hi[0], submode))
2221 hi[0] = force_reg (submode, hi[0]);
2222 if ((uns && !nonimmediate_operand (hi[1], submode))
2223 || (!uns && !x86_64_general_operand (hi[1], submode)))
2224 hi[1] = force_reg (submode, hi[1]);
2225
2226 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2227
2228 tmp = gen_rtx_SCRATCH (submode);
2229 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2230
2231 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2232 ix86_expand_branch (code, tmp, const0_rtx, label);
2233 return;
2234 }
2235
2236 default:
2237 break;
2238 }
2239
2240 /* Otherwise, we need two or three jumps. */
2241
2242 label2 = gen_label_rtx ();
2243
2244 code1 = code;
2245 code2 = swap_condition (code);
2246 code3 = unsigned_condition (code);
2247
2248 switch (code)
2249 {
2250 case LT: case GT: case LTU: case GTU:
2251 break;
2252
2253 case LE: code1 = LT; code2 = GT; break;
2254 case GE: code1 = GT; code2 = LT; break;
2255 case LEU: code1 = LTU; code2 = GTU; break;
2256 case GEU: code1 = GTU; code2 = LTU; break;
2257
2258 case EQ: code1 = UNKNOWN; code2 = NE; break;
2259 case NE: code2 = UNKNOWN; break;
2260
2261 default:
2262 gcc_unreachable ();
2263 }
2264
2265 /*
2266 * a < b =>
2267 * if (hi(a) < hi(b)) goto true;
2268 * if (hi(a) > hi(b)) goto false;
2269 * if (lo(a) < lo(b)) goto true;
2270 * false:
2271 */
2272
2273 if (code1 != UNKNOWN)
2274 ix86_expand_branch (code1, hi[0], hi[1], label);
2275 if (code2 != UNKNOWN)
2276 ix86_expand_branch (code2, hi[0], hi[1], label2);
2277
2278 ix86_expand_branch (code3, lo[0], lo[1], label);
2279
2280 if (code2 != UNKNOWN)
2281 emit_label (label2);
2282 return;
2283 }
2284
2285 default:
2286 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2287 goto simple;
2288 }
2289 }
2290
2291 /* Figure out whether to use unordered fp comparisons. */
2292
2293 static bool
2294 ix86_unordered_fp_compare (enum rtx_code code)
2295 {
2296 if (!TARGET_IEEE_FP)
2297 return false;
2298
2299 switch (code)
2300 {
2301 case LT:
2302 case LE:
2303 case GT:
2304 case GE:
2305 case LTGT:
2306 return false;
2307
2308 case EQ:
2309 case NE:
2310
2311 case UNORDERED:
2312 case ORDERED:
2313 case UNLT:
2314 case UNLE:
2315 case UNGT:
2316 case UNGE:
2317 case UNEQ:
2318 return true;
2319
2320 default:
2321 gcc_unreachable ();
2322 }
2323 }
2324
2325 /* Return a comparison we can do that is equivalent to
2326 swap_condition (code), except possibly for orderedness.
2327 Never change orderedness if TARGET_IEEE_FP, returning
2328 UNKNOWN in that case if necessary. */
2329
2330 static enum rtx_code
2331 ix86_fp_swap_condition (enum rtx_code code)
2332 {
2333 switch (code)
2334 {
2335 case GT: /* GTU - CF=0 & ZF=0 */
2336 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2337 case GE: /* GEU - CF=0 */
2338 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2339 case UNLT: /* LTU - CF=1 */
2340 return TARGET_IEEE_FP ? UNKNOWN : GT;
2341 case UNLE: /* LEU - CF=1 | ZF=1 */
2342 return TARGET_IEEE_FP ? UNKNOWN : GE;
2343 default:
2344 return swap_condition (code);
2345 }
2346 }
2347
2348 /* Return the cost of comparison CODE using the best strategy for performance.
2349 All of the following functions use the number of instructions as the cost metric.
2350 In the future this should be tweaked to compute bytes for optimize_size and
2351 take into account the performance of various instructions on various CPUs. */
2352
2353 static int
2354 ix86_fp_comparison_cost (enum rtx_code code)
2355 {
2356 int arith_cost;
2357
2358 /* The cost of code using bit-twiddling on %ah. */
2359 switch (code)
2360 {
2361 case UNLE:
2362 case UNLT:
2363 case LTGT:
2364 case GT:
2365 case GE:
2366 case UNORDERED:
2367 case ORDERED:
2368 case UNEQ:
2369 arith_cost = 4;
2370 break;
2371 case LT:
2372 case NE:
2373 case EQ:
2374 case UNGE:
2375 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2376 break;
2377 case LE:
2378 case UNGT:
2379 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2380 break;
2381 default:
2382 gcc_unreachable ();
2383 }
2384
2385 switch (ix86_fp_comparison_strategy (code))
2386 {
2387 case IX86_FPCMP_COMI:
2388 return arith_cost > 4 ? 3 : 2;
2389 case IX86_FPCMP_SAHF:
2390 return arith_cost > 4 ? 4 : 3;
2391 default:
2392 return arith_cost;
2393 }
2394 }
2395
2396 /* Swap, force into registers, or otherwise massage the two operands
2397 to an fp comparison. The operands are updated in place; the new
2398 comparison code is returned. */
2399
2400 static enum rtx_code
2401 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2402 {
2403 bool unordered_compare = ix86_unordered_fp_compare (code);
2404 rtx op0 = *pop0, op1 = *pop1;
2405 machine_mode op_mode = GET_MODE (op0);
2406 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2407
2408 /* All of the unordered compare instructions only work on registers.
2409 The same is true of the fcomi compare instructions. The XFmode
2410 compare instructions require registers except when comparing
2411 against zero or when converting operand 1 from fixed point to
2412 floating point. */
2413
2414 if (!is_sse
2415 && (unordered_compare
2416 || (op_mode == XFmode
2417 && ! (standard_80387_constant_p (op0) == 1
2418 || standard_80387_constant_p (op1) == 1)
2419 && GET_CODE (op1) != FLOAT)
2420 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2421 {
2422 op0 = force_reg (op_mode, op0);
2423 op1 = force_reg (op_mode, op1);
2424 }
2425 else
2426 {
2427 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2428 things around if they appear profitable, otherwise force op0
2429 into a register. */
2430
2431 if (standard_80387_constant_p (op0) == 0
2432 || (MEM_P (op0)
2433 && ! (standard_80387_constant_p (op1) == 0
2434 || MEM_P (op1))))
2435 {
2436 enum rtx_code new_code = ix86_fp_swap_condition (code);
2437 if (new_code != UNKNOWN)
2438 {
2439 std::swap (op0, op1);
2440 code = new_code;
2441 }
2442 }
2443
2444 if (!REG_P (op0))
2445 op0 = force_reg (op_mode, op0);
2446
2447 if (CONSTANT_P (op1))
2448 {
2449 int tmp = standard_80387_constant_p (op1);
2450 if (tmp == 0)
2451 op1 = validize_mem (force_const_mem (op_mode, op1));
2452 else if (tmp == 1)
2453 {
2454 if (TARGET_CMOVE)
2455 op1 = force_reg (op_mode, op1);
2456 }
2457 else
2458 op1 = force_reg (op_mode, op1);
2459 }
2460 }
2461
2462 /* Try to rearrange the comparison to make it cheaper. */
2463 if (ix86_fp_comparison_cost (code)
2464 > ix86_fp_comparison_cost (swap_condition (code))
2465 && (REG_P (op1) || can_create_pseudo_p ()))
2466 {
2467 std::swap (op0, op1);
2468 code = swap_condition (code);
2469 if (!REG_P (op0))
2470 op0 = force_reg (op_mode, op0);
2471 }
2472
2473 *pop0 = op0;
2474 *pop1 = op1;
2475 return code;
2476 }
2477
2478 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2479
2480 static rtx
2481 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2482 {
2483 bool unordered_compare = ix86_unordered_fp_compare (code);
2484 machine_mode cmp_mode;
2485 rtx tmp, scratch;
2486
2487 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2488
2489 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2490 if (unordered_compare)
2491 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2492
2493 /* Do fcomi/sahf based test when profitable. */
2494 switch (ix86_fp_comparison_strategy (code))
2495 {
2496 case IX86_FPCMP_COMI:
2497 cmp_mode = CCFPmode;
2498 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2499 break;
2500
2501 case IX86_FPCMP_SAHF:
2502 cmp_mode = CCFPmode;
2503 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2504 scratch = gen_reg_rtx (HImode);
2505 emit_insn (gen_rtx_SET (scratch, tmp));
2506 emit_insn (gen_x86_sahf_1 (scratch));
2507 break;
2508
2509 case IX86_FPCMP_ARITH:
2510 cmp_mode = CCNOmode;
2511 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2512 scratch = gen_reg_rtx (HImode);
2513 emit_insn (gen_rtx_SET (scratch, tmp));
2514
2515 /* In the unordered case, we have to check C2 for NaN's, which
2516 doesn't happen to work out to anything nice combination-wise.
2517 So do some bit twiddling on the value we've got in AH to come
2518 up with an appropriate set of condition codes. */
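/* After fnstsw the x87 condition bits appear in AH as C0 = 0x01, C2 = 0x04
   and C3 = 0x40 (C2 is set for unordered results); hence the 0x45, 0x44,
   0x05, 0x40 and 0x04 masks used below.  */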
2519
2520 switch (code)
2521 {
2522 case GT:
2523 case UNGT:
2524 if (code == GT || !TARGET_IEEE_FP)
2525 {
2526 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2527 code = EQ;
2528 }
2529 else
2530 {
2531 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2532 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2533 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2534 cmp_mode = CCmode;
2535 code = GEU;
2536 }
2537 break;
2538 case LT:
2539 case UNLT:
2540 if (code == LT && TARGET_IEEE_FP)
2541 {
2542 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2543 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2544 cmp_mode = CCmode;
2545 code = EQ;
2546 }
2547 else
2548 {
2549 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2550 code = NE;
2551 }
2552 break;
2553 case GE:
2554 case UNGE:
2555 if (code == GE || !TARGET_IEEE_FP)
2556 {
2557 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2558 code = EQ;
2559 }
2560 else
2561 {
2562 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2563 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2564 code = NE;
2565 }
2566 break;
2567 case LE:
2568 case UNLE:
2569 if (code == LE && TARGET_IEEE_FP)
2570 {
2571 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2572 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2573 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2574 cmp_mode = CCmode;
2575 code = LTU;
2576 }
2577 else
2578 {
2579 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2580 code = NE;
2581 }
2582 break;
2583 case EQ:
2584 case UNEQ:
2585 if (code == EQ && TARGET_IEEE_FP)
2586 {
2587 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2588 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2589 cmp_mode = CCmode;
2590 code = EQ;
2591 }
2592 else
2593 {
2594 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2595 code = NE;
2596 }
2597 break;
2598 case NE:
2599 case LTGT:
2600 if (code == NE && TARGET_IEEE_FP)
2601 {
2602 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2603 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2604 GEN_INT (0x40)));
2605 code = NE;
2606 }
2607 else
2608 {
2609 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2610 code = EQ;
2611 }
2612 break;
2613
2614 case UNORDERED:
2615 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2616 code = NE;
2617 break;
2618 case ORDERED:
2619 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2620 code = EQ;
2621 break;
2622
2623 default:
2624 gcc_unreachable ();
2625 }
2626 break;
2627
2628 default:
2629 gcc_unreachable();
2630 }
2631
2632 /* Return the test that should be put into the flags user, i.e.
2633 the bcc, scc, or cmov instruction. */
2634 return gen_rtx_fmt_ee (code, VOIDmode,
2635 gen_rtx_REG (cmp_mode, FLAGS_REG),
2636 const0_rtx);
2637 }
2638
2639 /* Generate insn patterns to do an integer compare of OPERANDS. */
2640
2641 static rtx
2642 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2643 {
2644 machine_mode cmpmode;
2645 rtx tmp, flags;
2646
2647 cmpmode = SELECT_CC_MODE (code, op0, op1);
2648 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2649
2650 /* This is very simple, but making the interface the same as in the
2651 FP case makes the rest of the code easier. */
2652 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2653 emit_insn (gen_rtx_SET (flags, tmp));
2654
2655 /* Return the test that should be put into the flags user, i.e.
2656 the bcc, scc, or cmov instruction. */
2657 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2658 }
2659
2660 static rtx
2661 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2662 {
2663 rtx ret;
2664
2665 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2666 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2667
2668 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2669 {
2670 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2671 ret = ix86_expand_fp_compare (code, op0, op1);
2672 }
2673 else
2674 ret = ix86_expand_int_compare (code, op0, op1);
2675
2676 return ret;
2677 }
2678
2679 void
2680 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2681 {
2682 rtx ret;
2683
2684 gcc_assert (GET_MODE (dest) == QImode);
2685
2686 ret = ix86_expand_compare (code, op0, op1);
2687 PUT_MODE (ret, QImode);
2688 emit_insn (gen_rtx_SET (dest, ret));
2689 }
2690
2691 /* Expand a comparison setting or clearing the carry flag. Return true when
2692 successful, and set *POP to the comparison rtx. */
2693 static bool
2694 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2695 {
2696 machine_mode mode
2697 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2698
2699 /* Do not handle double-mode compares that go through the special path. */
2700 if (mode == (TARGET_64BIT ? TImode : DImode))
2701 return false;
2702
2703 if (SCALAR_FLOAT_MODE_P (mode))
2704 {
2705 rtx compare_op;
2706 rtx_insn *compare_seq;
2707
2708 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2709
2710 /* Shortcut: the following common codes never translate
2711 into carry-flag compares. */
2712 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2713 || code == ORDERED || code == UNORDERED)
2714 return false;
2715
2716 /* These comparisons require the Zero flag; swap the operands so they won't. */
2717 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2718 && !TARGET_IEEE_FP)
2719 {
2720 std::swap (op0, op1);
2721 code = swap_condition (code);
2722 }
2723
2724 /* Try to expand the comparison and verify that we end up with a
2725 carry-flag-based comparison. This fails only when we decide to
2726 expand the comparison using arithmetic, which is not a very
2727 common scenario. */
2728 start_sequence ();
2729 compare_op = ix86_expand_fp_compare (code, op0, op1);
2730 compare_seq = get_insns ();
2731 end_sequence ();
2732
2733 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2734 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2735 else
2736 code = GET_CODE (compare_op);
2737
2738 if (code != LTU && code != GEU)
2739 return false;
2740
2741 emit_insn (compare_seq);
2742 *pop = compare_op;
2743 return true;
2744 }
2745
2746 if (!INTEGRAL_MODE_P (mode))
2747 return false;
2748
2749 switch (code)
2750 {
2751 case LTU:
2752 case GEU:
2753 break;
2754
2755 /* Convert a==0 into (unsigned)a<1. */
2756 case EQ:
2757 case NE:
2758 if (op1 != const0_rtx)
2759 return false;
2760 op1 = const1_rtx;
2761 code = (code == EQ ? LTU : GEU);
2762 break;
2763
2764 /* Convert a>b into b<a or a>=b+1. */
2765 case GTU:
2766 case LEU:
2767 if (CONST_INT_P (op1))
2768 {
2769 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2770 /* Bail out on overflow. We could still swap the operands, but
2771 that would force loading the constant into a register. */
2772 if (op1 == const0_rtx
2773 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2774 return false;
2775 code = (code == GTU ? GEU : LTU);
2776 }
2777 else
2778 {
2779 std::swap (op0, op1);
2780 code = (code == GTU ? LTU : GEU);
2781 }
2782 break;
2783
2784 /* Convert a>=0 into (unsigned)a<0x80000000. */
2785 case LT:
2786 case GE:
2787 if (mode == DImode || op1 != const0_rtx)
2788 return false;
2789 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2790 code = (code == LT ? GEU : LTU);
2791 break;
2792 case LE:
2793 case GT:
2794 if (mode == DImode || op1 != constm1_rtx)
2795 return false;
2796 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2797 code = (code == LE ? GEU : LTU);
2798 break;
2799
2800 default:
2801 return false;
2802 }
2803 /* Swapping operands may cause constant to appear as first operand. */
2804 if (!nonimmediate_operand (op0, VOIDmode))
2805 {
2806 if (!can_create_pseudo_p ())
2807 return false;
2808 op0 = force_reg (mode, op0);
2809 }
2810 *pop = ix86_expand_compare (code, op0, op1);
2811 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2812 return true;
2813 }
2814
2815 /* Expand conditional increment or decrement using adc/sbb instructions.
2816 The default case, using setcc followed by a conditional move, can be
2817 handled by generic code. */
2818 bool
2819 ix86_expand_int_addcc (rtx operands[])
2820 {
2821 enum rtx_code code = GET_CODE (operands[1]);
2822 rtx flags;
2823 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2824 rtx compare_op;
2825 rtx val = const0_rtx;
2826 bool fpcmp = false;
2827 machine_mode mode;
2828 rtx op0 = XEXP (operands[1], 0);
2829 rtx op1 = XEXP (operands[1], 1);
2830
2831 if (operands[3] != const1_rtx
2832 && operands[3] != constm1_rtx)
2833 return false;
2834 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2835 return false;
2836 code = GET_CODE (compare_op);
2837
2838 flags = XEXP (compare_op, 0);
2839
2840 if (GET_MODE (flags) == CCFPmode)
2841 {
2842 fpcmp = true;
2843 code = ix86_fp_compare_code_to_integer (code);
2844 }
2845
2846 if (code != LTU)
2847 {
2848 val = constm1_rtx;
2849 if (fpcmp)
2850 PUT_CODE (compare_op,
2851 reverse_condition_maybe_unordered
2852 (GET_CODE (compare_op)));
2853 else
2854 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2855 }
2856
2857 mode = GET_MODE (operands[0]);
2858
2859 /* Construct either adc or sbb insn. */
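/* After the adjustment above the embedded condition effectively tests the
   carry flag, so adc adds the condition and sbb subtracts it; VAL (0 or -1)
   compensates so that operands[0] becomes operands[2] plus or minus 1
   exactly when operands[1] holds, and operands[2] otherwise.  */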
2860 if ((code == LTU) == (operands[3] == constm1_rtx))
2861 insn = gen_sub3_carry;
2862 else
2863 insn = gen_add3_carry;
2864
2865 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2866
2867 return true;
2868 }
2869
2870 bool
2871 ix86_expand_int_movcc (rtx operands[])
2872 {
2873 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2874 rtx_insn *compare_seq;
2875 rtx compare_op;
2876 machine_mode mode = GET_MODE (operands[0]);
2877 bool sign_bit_compare_p = false;
2878 rtx op0 = XEXP (operands[1], 0);
2879 rtx op1 = XEXP (operands[1], 1);
2880
2881 if (GET_MODE (op0) == TImode
2882 || (GET_MODE (op0) == DImode
2883 && !TARGET_64BIT))
2884 return false;
2885
2886 start_sequence ();
2887 compare_op = ix86_expand_compare (code, op0, op1);
2888 compare_seq = get_insns ();
2889 end_sequence ();
2890
2891 compare_code = GET_CODE (compare_op);
2892
2893 if ((op1 == const0_rtx && (code == GE || code == LT))
2894 || (op1 == constm1_rtx && (code == GT || code == LE)))
2895 sign_bit_compare_p = true;
2896
2897 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2898 HImode insns, we'd be swallowed in word prefix ops. */
2899
2900 if ((mode != HImode || TARGET_FAST_PREFIX)
2901 && (mode != (TARGET_64BIT ? TImode : DImode))
2902 && CONST_INT_P (operands[2])
2903 && CONST_INT_P (operands[3]))
2904 {
2905 rtx out = operands[0];
2906 HOST_WIDE_INT ct = INTVAL (operands[2]);
2907 HOST_WIDE_INT cf = INTVAL (operands[3]);
2908 HOST_WIDE_INT diff;
2909
2910 diff = ct - cf;
2911 /* Sign-bit comparisons are better done using shifts than by using
2912 sbb. */
2913 if (sign_bit_compare_p
2914 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2915 {
2916 /* Detect overlap between destination and compare sources. */
2917 rtx tmp = out;
2918
2919 if (!sign_bit_compare_p)
2920 {
2921 rtx flags;
2922 bool fpcmp = false;
2923
2924 compare_code = GET_CODE (compare_op);
2925
2926 flags = XEXP (compare_op, 0);
2927
2928 if (GET_MODE (flags) == CCFPmode)
2929 {
2930 fpcmp = true;
2931 compare_code
2932 = ix86_fp_compare_code_to_integer (compare_code);
2933 }
2934
2935 /* To simplify the rest of the code, restrict to the GEU case. */
2936 if (compare_code == LTU)
2937 {
2938 std::swap (ct, cf);
2939 compare_code = reverse_condition (compare_code);
2940 code = reverse_condition (code);
2941 }
2942 else
2943 {
2944 if (fpcmp)
2945 PUT_CODE (compare_op,
2946 reverse_condition_maybe_unordered
2947 (GET_CODE (compare_op)));
2948 else
2949 PUT_CODE (compare_op,
2950 reverse_condition (GET_CODE (compare_op)));
2951 }
2952 diff = ct - cf;
2953
2954 if (reg_overlap_mentioned_p (out, op0)
2955 || reg_overlap_mentioned_p (out, op1))
2956 tmp = gen_reg_rtx (mode);
2957
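/* x86_mov*cc_0_m1 is the sbb reg,reg idiom: it produces all-ones when the
   carry flag is set and zero otherwise; the arithmetic below then maps this
   0/-1 value onto the two constants.  */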
2958 if (mode == DImode)
2959 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2960 else
2961 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2962 flags, compare_op));
2963 }
2964 else
2965 {
2966 if (code == GT || code == GE)
2967 code = reverse_condition (code);
2968 else
2969 {
2970 std::swap (ct, cf);
2971 diff = ct - cf;
2972 }
2973 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2974 }
2975
2976 if (diff == 1)
2977 {
2978 /*
2979 * cmpl op0,op1
2980 * sbbl dest,dest
2981 * [addl dest, ct]
2982 *
2983 * Size 5 - 8.
2984 */
2985 if (ct)
2986 tmp = expand_simple_binop (mode, PLUS,
2987 tmp, GEN_INT (ct),
2988 copy_rtx (tmp), 1, OPTAB_DIRECT);
2989 }
2990 else if (cf == -1)
2991 {
2992 /*
2993 * cmpl op0,op1
2994 * sbbl dest,dest
2995 * orl $ct, dest
2996 *
2997 * Size 8.
2998 */
2999 tmp = expand_simple_binop (mode, IOR,
3000 tmp, GEN_INT (ct),
3001 copy_rtx (tmp), 1, OPTAB_DIRECT);
3002 }
3003 else if (diff == -1 && ct)
3004 {
3005 /*
3006 * cmpl op0,op1
3007 * sbbl dest,dest
3008 * notl dest
3009 * [addl dest, cf]
3010 *
3011 * Size 8 - 11.
3012 */
3013 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3014 if (cf)
3015 tmp = expand_simple_binop (mode, PLUS,
3016 copy_rtx (tmp), GEN_INT (cf),
3017 copy_rtx (tmp), 1, OPTAB_DIRECT);
3018 }
3019 else
3020 {
3021 /*
3022 * cmpl op0,op1
3023 * sbbl dest,dest
3024 * [notl dest]
3025 * andl cf - ct, dest
3026 * [addl dest, ct]
3027 *
3028 * Size 8 - 11.
3029 */
3030
3031 if (cf == 0)
3032 {
3033 cf = ct;
3034 ct = 0;
3035 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3036 }
3037
3038 tmp = expand_simple_binop (mode, AND,
3039 copy_rtx (tmp),
3040 gen_int_mode (cf - ct, mode),
3041 copy_rtx (tmp), 1, OPTAB_DIRECT);
3042 if (ct)
3043 tmp = expand_simple_binop (mode, PLUS,
3044 copy_rtx (tmp), GEN_INT (ct),
3045 copy_rtx (tmp), 1, OPTAB_DIRECT);
3046 }
3047
3048 if (!rtx_equal_p (tmp, out))
3049 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3050
3051 return true;
3052 }
3053
3054 if (diff < 0)
3055 {
3056 machine_mode cmp_mode = GET_MODE (op0);
3057 enum rtx_code new_code;
3058
3059 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3060 {
3061 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3062
3063 /* We may be reversing an unordered compare to a normal compare, which
3064 is not valid in general (we may convert a non-trapping condition
3065 into a trapping one); however, on i386 we currently emit all
3066 comparisons unordered. */
3067 new_code = reverse_condition_maybe_unordered (code);
3068 }
3069 else
3070 new_code = ix86_reverse_condition (code, cmp_mode);
3071 if (new_code != UNKNOWN)
3072 {
3073 std::swap (ct, cf);
3074 diff = -diff;
3075 code = new_code;
3076 }
3077 }
3078
3079 compare_code = UNKNOWN;
3080 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3081 && CONST_INT_P (op1))
3082 {
3083 if (op1 == const0_rtx
3084 && (code == LT || code == GE))
3085 compare_code = code;
3086 else if (op1 == constm1_rtx)
3087 {
3088 if (code == LE)
3089 compare_code = LT;
3090 else if (code == GT)
3091 compare_code = GE;
3092 }
3093 }
3094
3095 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3096 if (compare_code != UNKNOWN
3097 && GET_MODE (op0) == GET_MODE (out)
3098 && (cf == -1 || ct == -1))
3099 {
3100 /* If the lea code below could be used, only optimize
3101 if it results in a 2-insn sequence. */
3102
3103 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3104 || diff == 3 || diff == 5 || diff == 9)
3105 || (compare_code == LT && ct == -1)
3106 || (compare_code == GE && cf == -1))
3107 {
3108 /*
3109 * notl op1 (if necessary)
3110 * sarl $31, op1
3111 * orl cf, op1
3112 */
3113 if (ct != -1)
3114 {
3115 cf = ct;
3116 ct = -1;
3117 code = reverse_condition (code);
3118 }
3119
3120 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3121
3122 out = expand_simple_binop (mode, IOR,
3123 out, GEN_INT (cf),
3124 out, 1, OPTAB_DIRECT);
3125 if (out != operands[0])
3126 emit_move_insn (operands[0], out);
3127
3128 return true;
3129 }
3130 }
3131
3132
3133 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3134 || diff == 3 || diff == 5 || diff == 9)
3135 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3136 && (mode != DImode
3137 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3138 {
3139 /*
3140 * xorl dest,dest
3141 * cmpl op1,op2
3142 * setcc dest
3143 * lea cf(dest*(ct-cf)),dest
3144 *
3145 * Size 14.
3146 *
3147 * This also catches the degenerate setcc-only case.
3148 */
3149
3150 rtx tmp;
3151 int nops;
3152
3153 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3154
3155 nops = 0;
3156 /* On x86_64 the lea instruction operates on Pmode, so we need
3157 to get the arithmetic done in the proper mode to match. */
3158 if (diff == 1)
3159 tmp = copy_rtx (out);
3160 else
3161 {
3162 rtx out1;
3163 out1 = copy_rtx (out);
3164 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3165 nops++;
3166 if (diff & 1)
3167 {
3168 tmp = gen_rtx_PLUS (mode, tmp, out1);
3169 nops++;
3170 }
3171 }
3172 if (cf != 0)
3173 {
3174 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
3175 nops++;
3176 }
3177 if (!rtx_equal_p (tmp, out))
3178 {
3179 if (nops == 1)
3180 out = force_operand (tmp, copy_rtx (out));
3181 else
3182 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3183 }
3184 if (!rtx_equal_p (out, operands[0]))
3185 emit_move_insn (operands[0], copy_rtx (out));
3186
3187 return true;
3188 }
3189
3190 /*
3191 * General case: Jumpful:
3192 * xorl dest,dest cmpl op1, op2
3193 * cmpl op1, op2 movl ct, dest
3194 * setcc dest jcc 1f
3195 * decl dest movl cf, dest
3196 * andl (cf-ct),dest 1:
3197 * addl ct,dest
3198 *
3199 * Size 20. Size 14.
3200 *
3201 * This is reasonably steep, but branch mispredict costs are
3202 * high on modern cpus, so consider failing only if optimizing
3203 * for space.
3204 */
3205
3206 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3207 && BRANCH_COST (optimize_insn_for_speed_p (),
3208 false) >= 2)
3209 {
3210 if (cf == 0)
3211 {
3212 machine_mode cmp_mode = GET_MODE (op0);
3213 enum rtx_code new_code;
3214
3215 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3216 {
3217 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3218
3219 /* We may be reversing an unordered compare to a normal compare,
3220 which is not valid in general (we may convert a non-trapping
3221 condition into a trapping one); however, on i386 we currently
3222 emit all comparisons unordered. */
3223 new_code = reverse_condition_maybe_unordered (code);
3224 }
3225 else
3226 {
3227 new_code = ix86_reverse_condition (code, cmp_mode);
3228 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3229 compare_code = reverse_condition (compare_code);
3230 }
3231
3232 if (new_code != UNKNOWN)
3233 {
3234 cf = ct;
3235 ct = 0;
3236 code = new_code;
3237 }
3238 }
3239
3240 if (compare_code != UNKNOWN)
3241 {
3242 /* notl op1 (if needed)
3243 sarl $31, op1
3244 andl (cf-ct), op1
3245 addl ct, op1
3246
3247 For x < 0 (resp. x <= -1) there will be no notl,
3248 so if possible swap the constants to get rid of the
3249 complement.
3250 True/false will be -1/0 while code below (store flag
3251 followed by decrement) is 0/-1, so the constants need
3252 to be exchanged once more. */
3253
3254 if (compare_code == GE || !cf)
3255 {
3256 code = reverse_condition (code);
3257 compare_code = LT;
3258 }
3259 else
3260 std::swap (ct, cf);
3261
3262 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3263 }
3264 else
3265 {
3266 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3267
3268 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3269 constm1_rtx,
3270 copy_rtx (out), 1, OPTAB_DIRECT);
3271 }
3272
3273 out = expand_simple_binop (mode, AND, copy_rtx (out),
3274 gen_int_mode (cf - ct, mode),
3275 copy_rtx (out), 1, OPTAB_DIRECT);
3276 if (ct)
3277 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3278 copy_rtx (out), 1, OPTAB_DIRECT);
3279 if (!rtx_equal_p (out, operands[0]))
3280 emit_move_insn (operands[0], copy_rtx (out));
3281
3282 return true;
3283 }
3284 }
3285
3286 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3287 {
3288 /* Try a few more things with specific constants and a variable. */
3289
3290 optab op;
3291 rtx var, orig_out, out, tmp;
3292
3293 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3294 return false;
3295
3296 /* If one of the two operands is an interesting constant, load a
3297 constant with the above and mask it in with a logical operation. */
3298
3299 if (CONST_INT_P (operands[2]))
3300 {
3301 var = operands[3];
3302 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3303 operands[3] = constm1_rtx, op = and_optab;
3304 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3305 operands[3] = const0_rtx, op = ior_optab;
3306 else
3307 return false;
3308 }
3309 else if (CONST_INT_P (operands[3]))
3310 {
3311 var = operands[2];
3312 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3313 operands[2] = constm1_rtx, op = and_optab;
3314 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3315 operands[2] = const0_rtx, op = ior_optab;
3316 else
3317 return false;
3318 }
3319 else
3320 return false;
3321
3322 orig_out = operands[0];
3323 tmp = gen_reg_rtx (mode);
3324 operands[0] = tmp;
3325
3326 /* Recurse to get the constant loaded. */
3327 if (!ix86_expand_int_movcc (operands))
3328 return false;
3329
3330 /* Mask in the interesting variable. */
3331 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3332 OPTAB_WIDEN);
3333 if (!rtx_equal_p (out, orig_out))
3334 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3335
3336 return true;
3337 }
3338
3339 /*
3340 * For comparison with above,
3341 *
3342 * movl cf,dest
3343 * movl ct,tmp
3344 * cmpl op1,op2
3345 * cmovcc tmp,dest
3346 *
3347 * Size 15.
3348 */
3349
3350 if (! nonimmediate_operand (operands[2], mode))
3351 operands[2] = force_reg (mode, operands[2]);
3352 if (! nonimmediate_operand (operands[3], mode))
3353 operands[3] = force_reg (mode, operands[3]);
3354
3355 if (! register_operand (operands[2], VOIDmode)
3356 && (mode == QImode
3357 || ! register_operand (operands[3], VOIDmode)))
3358 operands[2] = force_reg (mode, operands[2]);
3359
3360 if (mode == QImode
3361 && ! register_operand (operands[3], VOIDmode))
3362 operands[3] = force_reg (mode, operands[3]);
3363
3364 emit_insn (compare_seq);
3365 emit_insn (gen_rtx_SET (operands[0],
3366 gen_rtx_IF_THEN_ELSE (mode,
3367 compare_op, operands[2],
3368 operands[3])));
3369 return true;
3370 }
3371
3372 /* Detect conditional moves that exactly match min/max operational
3373 semantics. Note that this is IEEE safe, as long as we don't
3374 interchange the operands.
3375
3376 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3377 and TRUE if the operation is successful and instructions are emitted. */
3378
3379 static bool
3380 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3381 rtx cmp_op1, rtx if_true, rtx if_false)
3382 {
3383 machine_mode mode;
3384 bool is_min;
3385 rtx tmp;
3386
3387 if (code == LT)
3388 ;
3389 else if (code == UNGE)
3390 std::swap (if_true, if_false);
3391 else
3392 return false;
3393
3394 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3395 is_min = true;
3396 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3397 is_min = false;
3398 else
3399 return false;
3400
3401 mode = GET_MODE (dest);
3402
3403 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3404 but MODE may be a vector mode and thus not appropriate. */
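/* The IEEE_MIN/IEEE_MAX unspecs are expected to preserve this exact operand
   order, since the x86 min/max instructions are not commutative for NaNs
   and zeros of opposite sign; the plain SMIN/SMAX form below is only used
   when those cases do not matter.  */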
3405 if (!flag_finite_math_only || flag_signed_zeros)
3406 {
3407 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3408 rtvec v;
3409
3410 if_true = force_reg (mode, if_true);
3411 v = gen_rtvec (2, if_true, if_false);
3412 tmp = gen_rtx_UNSPEC (mode, v, u);
3413 }
3414 else
3415 {
3416 code = is_min ? SMIN : SMAX;
3417 if (MEM_P (if_true) && MEM_P (if_false))
3418 if_true = force_reg (mode, if_true);
3419 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3420 }
3421
3422 emit_insn (gen_rtx_SET (dest, tmp));
3423 return true;
3424 }
3425
3426 /* Expand an SSE comparison. Return the register with the result. */
3427
3428 static rtx
3429 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3430 rtx op_true, rtx op_false)
3431 {
3432 machine_mode mode = GET_MODE (dest);
3433 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3434
3435 /* In the general case the comparison result can differ from the operands' type. */
3436 machine_mode cmp_mode;
3437
3438 /* In AVX512F the result of comparison is an integer mask. */
3439 bool maskcmp = false;
3440 rtx x;
3441
3442 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
3443 {
3444 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3445 cmp_mode = int_mode_for_size (nbits, 0).require ();
3446 maskcmp = true;
3447 }
3448 else
3449 cmp_mode = cmp_ops_mode;
3450
3451 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3452
3453 int (*op1_predicate)(rtx, machine_mode)
3454 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3455
3456 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3457 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3458
3459 if (optimize
3460 || (maskcmp && cmp_mode != mode)
3461 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3462 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3463 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3464
3465 /* Compare patterns for int modes are unspec in AVX512F only. */
3466 if (maskcmp && (code == GT || code == EQ))
3467 {
3468 rtx (*gen)(rtx, rtx, rtx);
3469
3470 switch (cmp_ops_mode)
3471 {
3472 case E_V64QImode:
3473 gcc_assert (TARGET_AVX512BW);
3474 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
3475 break;
3476 case E_V32HImode:
3477 gcc_assert (TARGET_AVX512BW);
3478 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
3479 break;
3480 case E_V16SImode:
3481 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
3482 break;
3483 case E_V8DImode:
3484 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
3485 break;
3486 default:
3487 gen = NULL;
3488 }
3489
3490 if (gen)
3491 {
3492 emit_insn (gen (dest, cmp_op0, cmp_op1));
3493 return dest;
3494 }
3495 }
3496 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3497
3498 if (cmp_mode != mode && !maskcmp)
3499 {
3500 x = force_reg (cmp_ops_mode, x);
3501 convert_move (dest, x, false);
3502 }
3503 else
3504 emit_insn (gen_rtx_SET (dest, x));
3505
3506 return dest;
3507 }
3508
3509 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3510 operations. This is used for both scalar and vector conditional moves. */
3511
3512 void
3513 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3514 {
3515 machine_mode mode = GET_MODE (dest);
3516 machine_mode cmpmode = GET_MODE (cmp);
3517
3518 /* In AVX512F the result of comparison is an integer mask. */
3519 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
3520
3521 rtx t2, t3, x;
3522
3523 /* If we have an integer mask and an FP value then we need
3524 to cast the mask to the FP mode. */
3525 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3526 {
3527 cmp = force_reg (cmpmode, cmp);
3528 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3529 }
3530
3531 if (maskcmp)
3532 {
3533 rtx (*gen) (rtx, rtx) = NULL;
3534 if ((op_true == CONST0_RTX (mode)
3535 && vector_all_ones_operand (op_false, mode))
3536 || (op_false == CONST0_RTX (mode)
3537 && vector_all_ones_operand (op_true, mode)))
3538 switch (mode)
3539 {
3540 case E_V64QImode:
3541 if (TARGET_AVX512BW)
3542 gen = gen_avx512bw_cvtmask2bv64qi;
3543 break;
3544 case E_V32QImode:
3545 if (TARGET_AVX512VL && TARGET_AVX512BW)
3546 gen = gen_avx512vl_cvtmask2bv32qi;
3547 break;
3548 case E_V16QImode:
3549 if (TARGET_AVX512VL && TARGET_AVX512BW)
3550 gen = gen_avx512vl_cvtmask2bv16qi;
3551 break;
3552 case E_V32HImode:
3553 if (TARGET_AVX512BW)
3554 gen = gen_avx512bw_cvtmask2wv32hi;
3555 break;
3556 case E_V16HImode:
3557 if (TARGET_AVX512VL && TARGET_AVX512BW)
3558 gen = gen_avx512vl_cvtmask2wv16hi;
3559 break;
3560 case E_V8HImode:
3561 if (TARGET_AVX512VL && TARGET_AVX512BW)
3562 gen = gen_avx512vl_cvtmask2wv8hi;
3563 break;
3564 case E_V16SImode:
3565 if (TARGET_AVX512DQ)
3566 gen = gen_avx512f_cvtmask2dv16si;
3567 break;
3568 case E_V8SImode:
3569 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3570 gen = gen_avx512vl_cvtmask2dv8si;
3571 break;
3572 case E_V4SImode:
3573 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3574 gen = gen_avx512vl_cvtmask2dv4si;
3575 break;
3576 case E_V8DImode:
3577 if (TARGET_AVX512DQ)
3578 gen = gen_avx512f_cvtmask2qv8di;
3579 break;
3580 case E_V4DImode:
3581 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3582 gen = gen_avx512vl_cvtmask2qv4di;
3583 break;
3584 case E_V2DImode:
3585 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3586 gen = gen_avx512vl_cvtmask2qv2di;
3587 break;
3588 default:
3589 break;
3590 }
3591 if (gen && SCALAR_INT_MODE_P (cmpmode))
3592 {
3593 cmp = force_reg (cmpmode, cmp);
3594 if (op_true == CONST0_RTX (mode))
3595 {
3596 rtx (*gen_not) (rtx, rtx);
3597 switch (cmpmode)
3598 {
3599 case E_QImode: gen_not = gen_knotqi; break;
3600 case E_HImode: gen_not = gen_knothi; break;
3601 case E_SImode: gen_not = gen_knotsi; break;
3602 case E_DImode: gen_not = gen_knotdi; break;
3603 default: gcc_unreachable ();
3604 }
3605 rtx n = gen_reg_rtx (cmpmode);
3606 emit_insn (gen_not (n, cmp));
3607 cmp = n;
3608 }
3609 emit_insn (gen (dest, cmp));
3610 return;
3611 }
3612 }
3613 else if (vector_all_ones_operand (op_true, mode)
3614 && op_false == CONST0_RTX (mode))
3615 {
3616 emit_insn (gen_rtx_SET (dest, cmp));
3617 return;
3618 }
3619 else if (op_false == CONST0_RTX (mode))
3620 {
3621 op_true = force_reg (mode, op_true);
3622 x = gen_rtx_AND (mode, cmp, op_true);
3623 emit_insn (gen_rtx_SET (dest, x));
3624 return;
3625 }
3626 else if (op_true == CONST0_RTX (mode))
3627 {
3628 op_false = force_reg (mode, op_false);
3629 x = gen_rtx_NOT (mode, cmp);
3630 x = gen_rtx_AND (mode, x, op_false);
3631 emit_insn (gen_rtx_SET (dest, x));
3632 return;
3633 }
3634 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3635 {
3636 op_false = force_reg (mode, op_false);
3637 x = gen_rtx_IOR (mode, cmp, op_false);
3638 emit_insn (gen_rtx_SET (dest, x));
3639 return;
3640 }
3641 else if (TARGET_XOP)
3642 {
3643 op_true = force_reg (mode, op_true);
3644
3645 if (!nonimmediate_operand (op_false, mode))
3646 op_false = force_reg (mode, op_false);
3647
3648 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3649 op_true,
3650 op_false)));
3651 return;
3652 }
3653
3654 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3655 rtx d = dest;
3656
3657 if (!vector_operand (op_true, mode))
3658 op_true = force_reg (mode, op_true);
3659
3660 op_false = force_reg (mode, op_false);
3661
3662 switch (mode)
3663 {
3664 case E_V4SFmode:
3665 if (TARGET_SSE4_1)
3666 gen = gen_sse4_1_blendvps;
3667 break;
3668 case E_V2DFmode:
3669 if (TARGET_SSE4_1)
3670 gen = gen_sse4_1_blendvpd;
3671 break;
3672 case E_SFmode:
3673 if (TARGET_SSE4_1)
3674 {
3675 gen = gen_sse4_1_blendvss;
3676 op_true = force_reg (mode, op_true);
3677 }
3678 break;
3679 case E_DFmode:
3680 if (TARGET_SSE4_1)
3681 {
3682 gen = gen_sse4_1_blendvsd;
3683 op_true = force_reg (mode, op_true);
3684 }
3685 break;
3686 case E_V16QImode:
3687 case E_V8HImode:
3688 case E_V4SImode:
3689 case E_V2DImode:
3690 if (TARGET_SSE4_1)
3691 {
3692 gen = gen_sse4_1_pblendvb;
3693 if (mode != V16QImode)
3694 d = gen_reg_rtx (V16QImode);
3695 op_false = gen_lowpart (V16QImode, op_false);
3696 op_true = gen_lowpart (V16QImode, op_true);
3697 cmp = gen_lowpart (V16QImode, cmp);
3698 }
3699 break;
3700 case E_V8SFmode:
3701 if (TARGET_AVX)
3702 gen = gen_avx_blendvps256;
3703 break;
3704 case E_V4DFmode:
3705 if (TARGET_AVX)
3706 gen = gen_avx_blendvpd256;
3707 break;
3708 case E_V32QImode:
3709 case E_V16HImode:
3710 case E_V8SImode:
3711 case E_V4DImode:
3712 if (TARGET_AVX2)
3713 {
3714 gen = gen_avx2_pblendvb;
3715 if (mode != V32QImode)
3716 d = gen_reg_rtx (V32QImode);
3717 op_false = gen_lowpart (V32QImode, op_false);
3718 op_true = gen_lowpart (V32QImode, op_true);
3719 cmp = gen_lowpart (V32QImode, cmp);
3720 }
3721 break;
3722
3723 case E_V64QImode:
3724 gen = gen_avx512bw_blendmv64qi;
3725 break;
3726 case E_V32HImode:
3727 gen = gen_avx512bw_blendmv32hi;
3728 break;
3729 case E_V16SImode:
3730 gen = gen_avx512f_blendmv16si;
3731 break;
3732 case E_V8DImode:
3733 gen = gen_avx512f_blendmv8di;
3734 break;
3735 case E_V8DFmode:
3736 gen = gen_avx512f_blendmv8df;
3737 break;
3738 case E_V16SFmode:
3739 gen = gen_avx512f_blendmv16sf;
3740 break;
3741
3742 default:
3743 break;
3744 }
3745
3746 if (gen != NULL)
3747 {
3748 emit_insn (gen (d, op_false, op_true, cmp));
3749 if (d != dest)
3750 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3751 }
3752 else
3753 {
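/* No blend instruction is available for this mode; fall back to
   dest = (op_true & cmp) | (op_false & ~cmp).  */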
3754 op_true = force_reg (mode, op_true);
3755
3756 t2 = gen_reg_rtx (mode);
3757 if (optimize)
3758 t3 = gen_reg_rtx (mode);
3759 else
3760 t3 = dest;
3761
3762 x = gen_rtx_AND (mode, op_true, cmp);
3763 emit_insn (gen_rtx_SET (t2, x));
3764
3765 x = gen_rtx_NOT (mode, cmp);
3766 x = gen_rtx_AND (mode, x, op_false);
3767 emit_insn (gen_rtx_SET (t3, x));
3768
3769 x = gen_rtx_IOR (mode, t3, t2);
3770 emit_insn (gen_rtx_SET (dest, x));
3771 }
3772 }
3773
3774 /* Swap, force into registers, or otherwise massage the two operands
3775 to an sse comparison with a mask result. Thus we differ a bit from
3776 ix86_prepare_fp_compare_args which expects to produce a flags result.
3777
3778 The DEST operand exists to help determine whether to commute commutative
3779 operators. The POP0/POP1 operands are updated in place. The new
3780 comparison code is returned, or UNKNOWN if not implementable. */
3781
3782 static enum rtx_code
3783 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3784 rtx *pop0, rtx *pop1)
3785 {
3786 switch (code)
3787 {
3788 case LTGT:
3789 case UNEQ:
3790 /* AVX supports all the needed comparisons. */
3791 if (TARGET_AVX)
3792 break;
3793 /* We have no LTGT as an operator. We could implement it with
3794 NE & ORDERED, but this requires an extra temporary. It's
3795 not clear that it's worth it. */
3796 return UNKNOWN;
3797
3798 case LT:
3799 case LE:
3800 case UNGT:
3801 case UNGE:
3802 /* These are supported directly. */
3803 break;
3804
3805 case EQ:
3806 case NE:
3807 case UNORDERED:
3808 case ORDERED:
3809 /* AVX has 3 operand comparisons, no need to swap anything. */
3810 if (TARGET_AVX)
3811 break;
3812 /* For commutative operators, try to canonicalize the destination
3813 operand to be first in the comparison - this helps reload to
3814 avoid extra moves. */
3815 if (!dest || !rtx_equal_p (dest, *pop1))
3816 break;
3817 /* FALLTHRU */
3818
3819 case GE:
3820 case GT:
3821 case UNLE:
3822 case UNLT:
3823 /* These are not supported directly before AVX, and furthermore
3824 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3825 comparison operands to transform into something that is
3826 supported. */
3827 std::swap (*pop0, *pop1);
3828 code = swap_condition (code);
3829 break;
3830
3831 default:
3832 gcc_unreachable ();
3833 }
3834
3835 return code;
3836 }
3837
3838 /* Expand a floating-point conditional move. Return true if successful. */
3839
3840 bool
3841 ix86_expand_fp_movcc (rtx operands[])
3842 {
3843 machine_mode mode = GET_MODE (operands[0]);
3844 enum rtx_code code = GET_CODE (operands[1]);
3845 rtx tmp, compare_op;
3846 rtx op0 = XEXP (operands[1], 0);
3847 rtx op1 = XEXP (operands[1], 1);
3848
3849 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3850 {
3851 machine_mode cmode;
3852
3853 /* Since we have no cmove for SSE registers, don't force bad register
3854 allocation just to gain access to it. Deny movcc when the
3855 comparison mode doesn't match the move mode. */
3856 cmode = GET_MODE (op0);
3857 if (cmode == VOIDmode)
3858 cmode = GET_MODE (op1);
3859 if (cmode != mode)
3860 return false;
3861
3862 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3863 if (code == UNKNOWN)
3864 return false;
3865
3866 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3867 operands[2], operands[3]))
3868 return true;
3869
3870 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3871 operands[2], operands[3]);
3872 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3873 return true;
3874 }
3875
3876 if (GET_MODE (op0) == TImode
3877 || (GET_MODE (op0) == DImode
3878 && !TARGET_64BIT))
3879 return false;
3880
3881 /* The floating point conditional move instructions don't directly
3882 support conditions resulting from a signed integer comparison. */
3883
3884 compare_op = ix86_expand_compare (code, op0, op1);
3885 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3886 {
3887 tmp = gen_reg_rtx (QImode);
3888 ix86_expand_setcc (tmp, code, op0, op1);
3889
3890 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3891 }
3892
3893 emit_insn (gen_rtx_SET (operands[0],
3894 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3895 operands[2], operands[3])));
3896
3897 return true;
3898 }
3899
3900 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3901
3902 static int
3903 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3904 {
3905 switch (code)
3906 {
3907 case EQ:
3908 return 0;
3909 case LT:
3910 case LTU:
3911 return 1;
3912 case LE:
3913 case LEU:
3914 return 2;
3915 case NE:
3916 return 4;
3917 case GE:
3918 case GEU:
3919 return 5;
3920 case GT:
3921 case GTU:
3922 return 6;
3923 default:
3924 gcc_unreachable ();
3925 }
3926 }
3927
3928 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
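/* These match the vcmpps/vcmppd predicate encodings; e.g. 0x00 is
   _CMP_EQ_OQ, 0x03 is _CMP_UNORD_Q and 0x04 is _CMP_NEQ_UQ.  */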
3929
3930 static int
3931 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3932 {
3933 switch (code)
3934 {
3935 case EQ:
3936 return 0x00;
3937 case NE:
3938 return 0x04;
3939 case GT:
3940 return 0x0e;
3941 case LE:
3942 return 0x02;
3943 case GE:
3944 return 0x0d;
3945 case LT:
3946 return 0x01;
3947 case UNLE:
3948 return 0x0a;
3949 case UNLT:
3950 return 0x09;
3951 case UNGE:
3952 return 0x05;
3953 case UNGT:
3954 return 0x06;
3955 case UNEQ:
3956 return 0x18;
3957 case LTGT:
3958 return 0x0c;
3959 case ORDERED:
3960 return 0x07;
3961 case UNORDERED:
3962 return 0x03;
3963 default:
3964 gcc_unreachable ();
3965 }
3966 }
3967
3968 /* Return immediate value to be used in UNSPEC_PCMP
3969 for comparison CODE in MODE. */
3970
3971 static int
3972 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3973 {
3974 if (FLOAT_MODE_P (mode))
3975 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3976 return ix86_int_cmp_code_to_pcmp_immediate (code);
3977 }
3978
3979 /* Expand AVX-512 vector comparison. */
3980
3981 bool
3982 ix86_expand_mask_vec_cmp (rtx operands[])
3983 {
3984 machine_mode mask_mode = GET_MODE (operands[0]);
3985 machine_mode cmp_mode = GET_MODE (operands[2]);
3986 enum rtx_code code = GET_CODE (operands[1]);
3987 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3988 int unspec_code;
3989 rtx unspec;
3990
3991 switch (code)
3992 {
3993 case LEU:
3994 case GTU:
3995 case GEU:
3996 case LTU:
3997 unspec_code = UNSPEC_UNSIGNED_PCMP;
3998 break;
3999
4000 default:
4001 unspec_code = UNSPEC_PCMP;
4002 }
4003
4004 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
4005 operands[3], imm),
4006 unspec_code);
4007 emit_insn (gen_rtx_SET (operands[0], unspec));
4008
4009 return true;
4010 }
4011
4012 /* Expand fp vector comparison. */
4013
4014 bool
4015 ix86_expand_fp_vec_cmp (rtx operands[])
4016 {
4017 enum rtx_code code = GET_CODE (operands[1]);
4018 rtx cmp;
4019
4020 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4021 &operands[2], &operands[3]);
4022 if (code == UNKNOWN)
4023 {
4024 rtx temp;
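/* LTGT decomposes as ORDERED && NE, and UNEQ as UNORDERED || EQ; each is
   built from two comparisons that are combined below.  */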
4025 switch (GET_CODE (operands[1]))
4026 {
4027 case LTGT:
4028 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4029 operands[3], NULL, NULL);
4030 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4031 operands[3], NULL, NULL);
4032 code = AND;
4033 break;
4034 case UNEQ:
4035 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4036 operands[3], NULL, NULL);
4037 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4038 operands[3], NULL, NULL);
4039 code = IOR;
4040 break;
4041 default:
4042 gcc_unreachable ();
4043 }
4044 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4045 OPTAB_DIRECT);
4046 }
4047 else
4048 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4049 operands[1], operands[2]);
4050
4051 if (operands[0] != cmp)
4052 emit_move_insn (operands[0], cmp);
4053
4054 return true;
4055 }
4056
4057 static rtx
4058 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4059 rtx op_true, rtx op_false, bool *negate)
4060 {
4061 machine_mode data_mode = GET_MODE (dest);
4062 machine_mode mode = GET_MODE (cop0);
4063 rtx x;
4064
4065 *negate = false;
4066
4067 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4068 if (TARGET_XOP
4069 && (mode == V16QImode || mode == V8HImode
4070 || mode == V4SImode || mode == V2DImode))
4071 ;
4072 else
4073 {
4074 /* Canonicalize the comparison to EQ, GT, GTU. */
4075 switch (code)
4076 {
4077 case EQ:
4078 case GT:
4079 case GTU:
4080 break;
4081
4082 case NE:
4083 case LE:
4084 case LEU:
4085 code = reverse_condition (code);
4086 *negate = true;
4087 break;
4088
4089 case GE:
4090 case GEU:
4091 code = reverse_condition (code);
4092 *negate = true;
4093 /* FALLTHRU */
4094
4095 case LT:
4096 case LTU:
4097 std::swap (cop0, cop1);
4098 code = swap_condition (code);
4099 break;
4100
4101 default:
4102 gcc_unreachable ();
4103 }
4104
4105 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4106 if (mode == V2DImode)
4107 {
4108 switch (code)
4109 {
4110 case EQ:
4111 /* SSE4.1 supports EQ. */
4112 if (!TARGET_SSE4_1)
4113 return NULL;
4114 break;
4115
4116 case GT:
4117 case GTU:
4118 /* SSE4.2 supports GT/GTU. */
4119 if (!TARGET_SSE4_2)
4120 return NULL;
4121 break;
4122
4123 default:
4124 gcc_unreachable ();
4125 }
4126 }
4127
4128 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4129 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4130 if (*negate)
4131 std::swap (optrue, opfalse);
4132
4133 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4134 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4135 min (x, y) == x). While we add one instruction (the minimum),
4136 we remove the need for two instructions in the negation, as the
4137 result already comes out in the desired form.
4138 When using masks, do it for SI/DImode element types, as it is shorter
4139 than the two subtractions. */
4140 if ((code != EQ
4141 && GET_MODE_SIZE (mode) != 64
4142 && vector_all_ones_operand (opfalse, data_mode)
4143 && optrue == CONST0_RTX (data_mode))
4144 || (code == GTU
4145 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4146 /* Don't do it if we are not using integer masks and we would
4147 end up with the right values in the registers anyway. */
4148 && (GET_MODE_SIZE (mode) == 64
4149 || !vector_all_ones_operand (optrue, data_mode)
4150 || opfalse != CONST0_RTX (data_mode))))
4151 {
4152 rtx (*gen) (rtx, rtx, rtx) = NULL;
4153
4154 switch (mode)
4155 {
4156 case E_V16SImode:
4157 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4158 break;
4159 case E_V8DImode:
4160 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4161 cop0 = force_reg (mode, cop0);
4162 cop1 = force_reg (mode, cop1);
4163 break;
4164 case E_V32QImode:
4165 if (TARGET_AVX2)
4166 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4167 break;
4168 case E_V16HImode:
4169 if (TARGET_AVX2)
4170 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4171 break;
4172 case E_V8SImode:
4173 if (TARGET_AVX2)
4174 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4175 break;
4176 case E_V4DImode:
4177 if (TARGET_AVX512VL)
4178 {
4179 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4180 cop0 = force_reg (mode, cop0);
4181 cop1 = force_reg (mode, cop1);
4182 }
4183 break;
4184 case E_V16QImode:
4185 if (code == GTU && TARGET_SSE2)
4186 gen = gen_uminv16qi3;
4187 else if (code == GT && TARGET_SSE4_1)
4188 gen = gen_sminv16qi3;
4189 break;
4190 case E_V8HImode:
4191 if (code == GTU && TARGET_SSE4_1)
4192 gen = gen_uminv8hi3;
4193 else if (code == GT && TARGET_SSE2)
4194 gen = gen_sminv8hi3;
4195 break;
4196 case E_V4SImode:
4197 if (TARGET_SSE4_1)
4198 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4199 break;
4200 case E_V2DImode:
4201 if (TARGET_AVX512VL)
4202 {
4203 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4204 cop0 = force_reg (mode, cop0);
4205 cop1 = force_reg (mode, cop1);
4206 }
4207 break;
4208 default:
4209 break;
4210 }
4211
4212 if (gen)
4213 {
4214 rtx tem = gen_reg_rtx (mode);
4215 if (!vector_operand (cop0, mode))
4216 cop0 = force_reg (mode, cop0);
4217 if (!vector_operand (cop1, mode))
4218 cop1 = force_reg (mode, cop1);
4219 *negate = !*negate;
4220 emit_insn (gen (tem, cop0, cop1));
4221 cop1 = tem;
4222 code = EQ;
4223 }
4224 }
4225
4226 /* Unsigned parallel compare is not supported by the hardware.
4227 Play some tricks to turn this into a signed comparison
4228 against 0. */
4229 if (code == GTU)
4230 {
4231 cop0 = force_reg (mode, cop0);
4232
4233 switch (mode)
4234 {
4235 case E_V16SImode:
4236 case E_V8DImode:
4237 case E_V8SImode:
4238 case E_V4DImode:
4239 case E_V4SImode:
4240 case E_V2DImode:
4241 {
4242 rtx t1, t2, mask;
4243
4244 /* Subtract (-(INT MAX) - 1) from both operands to make
4245 them signed. */
4246 mask = ix86_build_signbit_mask (mode, true, false);
4247 t1 = gen_reg_rtx (mode);
4248 emit_insn (gen_sub3_insn (t1, cop0, mask));
4249
4250 t2 = gen_reg_rtx (mode);
4251 emit_insn (gen_sub3_insn (t2, cop1, mask));
4252
4253 cop0 = t1;
4254 cop1 = t2;
4255 code = GT;
4256 }
4257 break;
4258
4259 case E_V64QImode:
4260 case E_V32HImode:
4261 case E_V32QImode:
4262 case E_V16HImode:
4263 case E_V16QImode:
4264 case E_V8HImode:
4265 /* Perform a parallel unsigned saturating subtraction. */
4266 x = gen_reg_rtx (mode);
4267 emit_insn (gen_rtx_SET
4268 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4269 cop0 = x;
4270 cop1 = CONST0_RTX (mode);
4271 code = EQ;
4272 *negate = !*negate;
4273 break;
4274
4275 default:
4276 gcc_unreachable ();
4277 }
4278 }
4279 }
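/* Rough scalar picture of the unsigned-compare tricks above (an
   illustration, not the expander itself): flipping the sign bit turns
   an unsigned order into a signed one, e.g. for 32-bit elements
   (a ^ 0x80000000u) and (b ^ 0x80000000u) compare as signed integers
   exactly the way a and b compare as unsigned integers, which is what
   subtracting the sign-bit mask achieves.  For the narrow element
   types,

     (unsigned) a > (unsigned) b   <==>   saturating (a - b) != 0,

   which the code expresses as an EQ against zero plus a negation.  */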
4280
4281 if (*negate)
4282 std::swap (op_true, op_false);
4283
4284 /* Allow the comparison to be done in one mode, but the movcc to
4285 happen in another mode. */
4286 if (data_mode == mode)
4287 {
4288 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4289 op_true, op_false);
4290 }
4291 else
4292 {
4293 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4294 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4295 op_true, op_false);
4296 if (GET_MODE (x) == mode)
4297 x = gen_lowpart (data_mode, x);
4298 }
4299
4300 return x;
4301 }
4302
4303 /* Expand integer vector comparison. */
4304
4305 bool
4306 ix86_expand_int_vec_cmp (rtx operands[])
4307 {
4308 rtx_code code = GET_CODE (operands[1]);
4309 bool negate = false;
4310 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4311 operands[3], NULL, NULL, &negate);
4312
4313 if (!cmp)
4314 return false;
4315
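/* If the canonicalization inverted the comparison (e.g. NE lowered as
   EQ, so NEGATE is true), flip the mask with one more compare: the SSE
   compares produce all-ones / all-zeros elements, so comparing the
   intermediate result against zero with EQ yields its complement.  */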
4316 if (negate)
4317 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4318 CONST0_RTX (GET_MODE (cmp)),
4319 NULL, NULL, &negate);
4320
4321 gcc_assert (!negate);
4322
4323 if (operands[0] != cmp)
4324 emit_move_insn (operands[0], cmp);
4325
4326 return true;
4327 }
4328
4329 /* Expand a floating-point vector conditional move; a vcond operation
4330 rather than a movcc operation. */
4331
4332 bool
4333 ix86_expand_fp_vcond (rtx operands[])
4334 {
4335 enum rtx_code code = GET_CODE (operands[3]);
4336 rtx cmp;
4337
4338 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4339 &operands[4], &operands[5]);
4340 if (code == UNKNOWN)
4341 {
4342 rtx temp;
4343 switch (GET_CODE (operands[3]))
4344 {
4345 case LTGT:
4346 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4347 operands[5], operands[0], operands[0]);
4348 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4349 operands[5], operands[1], operands[2]);
4350 code = AND;
4351 break;
4352 case UNEQ:
4353 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4354 operands[5], operands[0], operands[0]);
4355 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4356 operands[5], operands[1], operands[2]);
4357 code = IOR;
4358 break;
4359 default:
4360 gcc_unreachable ();
4361 }
4362 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4363 OPTAB_DIRECT);
4364 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4365 return true;
4366 }
4367
4368 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4369 operands[5], operands[1], operands[2]))
4370 return true;
4371
4372 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4373 operands[1], operands[2]);
4374 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4375 return true;
4376 }
4377
4378 /* Expand a signed/unsigned integral vector conditional move. */
4379
4380 bool
4381 ix86_expand_int_vcond (rtx operands[])
4382 {
4383 machine_mode data_mode = GET_MODE (operands[0]);
4384 machine_mode mode = GET_MODE (operands[4]);
4385 enum rtx_code code = GET_CODE (operands[3]);
4386 bool negate = false;
4387 rtx x, cop0, cop1;
4388
4389 cop0 = operands[4];
4390 cop1 = operands[5];
4391
4392 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4393 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4394 if ((code == LT || code == GE)
4395 && data_mode == mode
4396 && cop1 == CONST0_RTX (mode)
4397 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4398 && GET_MODE_UNIT_SIZE (data_mode) > 1
4399 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4400 && (GET_MODE_SIZE (data_mode) == 16
4401 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4402 {
4403 rtx negop = operands[2 - (code == LT)];
4404 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4405 if (negop == CONST1_RTX (data_mode))
4406 {
4407 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4408 operands[0], 1, OPTAB_DIRECT);
4409 if (res != operands[0])
4410 emit_move_insn (operands[0], res);
4411 return true;
4412 }
4413 else if (GET_MODE_INNER (data_mode) != DImode
4414 && vector_all_ones_operand (negop, data_mode))
4415 {
4416 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4417 operands[0], 0, OPTAB_DIRECT);
4418 if (res != operands[0])
4419 emit_move_insn (operands[0], res);
4420 return true;
4421 }
4422 }
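/* Scalar sketch of the optimization above (illustrative): with 32-bit
   elements and two's complement arithmetic,

     x < 0 ? -1 : 0   ==   x >> 31              (arithmetic shift)
     x < 0 ?  1 : 0   ==   (unsigned) x >> 31   (logical shift)

   so the whole vcond collapses to a single per-element shift by
   GET_MODE_UNIT_BITSIZE - 1.  */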
4423
4424 if (!nonimmediate_operand (cop1, mode))
4425 cop1 = force_reg (mode, cop1);
4426 if (!general_operand (operands[1], data_mode))
4427 operands[1] = force_reg (data_mode, operands[1]);
4428 if (!general_operand (operands[2], data_mode))
4429 operands[2] = force_reg (data_mode, operands[2]);
4430
4431 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4432 operands[1], operands[2], &negate);
4433
4434 if (!x)
4435 return false;
4436
4437 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4438 operands[2-negate]);
4439 return true;
4440 }
4441
4442 static bool
4443 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4444 struct expand_vec_perm_d *d)
4445 {
4446 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4447 expander, so args are either in d, or in op0, op1 etc. */
4448 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4449 machine_mode maskmode = mode;
4450 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4451
4452 switch (mode)
4453 {
4454 case E_V8HImode:
4455 if (TARGET_AVX512VL && TARGET_AVX512BW)
4456 gen = gen_avx512vl_vpermt2varv8hi3;
4457 break;
4458 case E_V16HImode:
4459 if (TARGET_AVX512VL && TARGET_AVX512BW)
4460 gen = gen_avx512vl_vpermt2varv16hi3;
4461 break;
4462 case E_V64QImode:
4463 if (TARGET_AVX512VBMI)
4464 gen = gen_avx512bw_vpermt2varv64qi3;
4465 break;
4466 case E_V32HImode:
4467 if (TARGET_AVX512BW)
4468 gen = gen_avx512bw_vpermt2varv32hi3;
4469 break;
4470 case E_V4SImode:
4471 if (TARGET_AVX512VL)
4472 gen = gen_avx512vl_vpermt2varv4si3;
4473 break;
4474 case E_V8SImode:
4475 if (TARGET_AVX512VL)
4476 gen = gen_avx512vl_vpermt2varv8si3;
4477 break;
4478 case E_V16SImode:
4479 if (TARGET_AVX512F)
4480 gen = gen_avx512f_vpermt2varv16si3;
4481 break;
4482 case E_V4SFmode:
4483 if (TARGET_AVX512VL)
4484 {
4485 gen = gen_avx512vl_vpermt2varv4sf3;
4486 maskmode = V4SImode;
4487 }
4488 break;
4489 case E_V8SFmode:
4490 if (TARGET_AVX512VL)
4491 {
4492 gen = gen_avx512vl_vpermt2varv8sf3;
4493 maskmode = V8SImode;
4494 }
4495 break;
4496 case E_V16SFmode:
4497 if (TARGET_AVX512F)
4498 {
4499 gen = gen_avx512f_vpermt2varv16sf3;
4500 maskmode = V16SImode;
4501 }
4502 break;
4503 case E_V2DImode:
4504 if (TARGET_AVX512VL)
4505 gen = gen_avx512vl_vpermt2varv2di3;
4506 break;
4507 case E_V4DImode:
4508 if (TARGET_AVX512VL)
4509 gen = gen_avx512vl_vpermt2varv4di3;
4510 break;
4511 case E_V8DImode:
4512 if (TARGET_AVX512F)
4513 gen = gen_avx512f_vpermt2varv8di3;
4514 break;
4515 case E_V2DFmode:
4516 if (TARGET_AVX512VL)
4517 {
4518 gen = gen_avx512vl_vpermt2varv2df3;
4519 maskmode = V2DImode;
4520 }
4521 break;
4522 case E_V4DFmode:
4523 if (TARGET_AVX512VL)
4524 {
4525 gen = gen_avx512vl_vpermt2varv4df3;
4526 maskmode = V4DImode;
4527 }
4528 break;
4529 case E_V8DFmode:
4530 if (TARGET_AVX512F)
4531 {
4532 gen = gen_avx512f_vpermt2varv8df3;
4533 maskmode = V8DImode;
4534 }
4535 break;
4536 default:
4537 break;
4538 }
4539
4540 if (gen == NULL)
4541 return false;
4542
4543 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4544 expander, so args are either in d, or in op0, op1 etc. */
4545 if (d)
4546 {
4547 rtx vec[64];
4548 target = d->target;
4549 op0 = d->op0;
4550 op1 = d->op1;
4551 for (int i = 0; i < d->nelt; ++i)
4552 vec[i] = GEN_INT (d->perm[i]);
4553 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4554 }
4555
4556 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4557 return true;
4558 }
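/* For illustration (not part of this file's logic): the vpermt2var
   patterns selected above correspond to the two-source VPERMT2*
   instructions, e.g. for V16SImode roughly

     __m512i r = _mm512_permutex2var_epi32 (op0, sel, op1);

   where each selector element indexes into the 32-element
   concatenation of OP0 and OP1.  */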
4559
4560 /* Expand a variable vector permutation. */
4561
4562 void
4563 ix86_expand_vec_perm (rtx operands[])
4564 {
4565 rtx target = operands[0];
4566 rtx op0 = operands[1];
4567 rtx op1 = operands[2];
4568 rtx mask = operands[3];
4569 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4570 machine_mode mode = GET_MODE (op0);
4571 machine_mode maskmode = GET_MODE (mask);
4572 int w, e, i;
4573 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4574
4575 /* Number of elements in the vector. */
4576 w = GET_MODE_NUNITS (mode);
4577 e = GET_MODE_UNIT_SIZE (mode);
4578 gcc_assert (w <= 64);
4579
4580 if (TARGET_AVX512F && one_operand_shuffle)
4581 {
4582 rtx (*gen) (rtx, rtx, rtx) = NULL;
4583 switch (mode)
4584 {
4585 case E_V16SImode:
4586 gen = gen_avx512f_permvarv16si;
4587 break;
4588 case E_V16SFmode:
4589 gen = gen_avx512f_permvarv16sf;
4590 break;
4591 case E_V8DImode:
4592 gen = gen_avx512f_permvarv8di;
4593 break;
4594 case E_V8DFmode:
4595 gen = gen_avx512f_permvarv8df;
4596 break;
4597 default:
4598 break;
4599 }
4600 if (gen != NULL)
4601 {
4602 emit_insn (gen (target, op0, mask));
4603 return;
4604 }
4605 }
4606
4607 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4608 return;
4609
4610 if (TARGET_AVX2)
4611 {
4612 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4613 {
4614 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4615 a constant shuffle operand.  With a tiny bit of effort we can
4616 use VPERMD instead. A re-interpretation stall for V4DFmode is
4617 unfortunate but there's no avoiding it.
4618 Similarly for V16HImode we don't have instructions for variable
4619 shuffling, while for V32QImode we can, after preparing suitable
4620 masks, use vpshufb; vpshufb; vpermq; vpor.  */
4621
4622 if (mode == V16HImode)
4623 {
4624 maskmode = mode = V32QImode;
4625 w = 32;
4626 e = 1;
4627 }
4628 else
4629 {
4630 maskmode = mode = V8SImode;
4631 w = 8;
4632 e = 4;
4633 }
4634 t1 = gen_reg_rtx (maskmode);
4635
4636 /* Replicate the low bits of the V4DImode mask into V8SImode:
4637 mask = { A B C D }
4638 t1 = { A A B B C C D D }. */
4639 for (i = 0; i < w / 2; ++i)
4640 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4641 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4642 vt = force_reg (maskmode, vt);
4643 mask = gen_lowpart (maskmode, mask);
4644 if (maskmode == V8SImode)
4645 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4646 else
4647 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4648
4649 /* Multiply the shuffle indices by two.  */
4650 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4651 OPTAB_DIRECT);
4652
4653 /* Add one to the odd shuffle indices:
4654 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4655 for (i = 0; i < w / 2; ++i)
4656 {
4657 vec[i * 2] = const0_rtx;
4658 vec[i * 2 + 1] = const1_rtx;
4659 }
4660 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4661 vt = validize_mem (force_const_mem (maskmode, vt));
4662 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4663 OPTAB_DIRECT);
4664
4665 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4666 operands[3] = mask = t1;
4667 target = gen_reg_rtx (mode);
4668 op0 = gen_lowpart (mode, op0);
4669 op1 = gen_lowpart (mode, op1);
4670 }
4671
4672 switch (mode)
4673 {
4674 case E_V8SImode:
4675 /* The VPERMD and VPERMPS instructions already properly ignore
4676 the high bits of the shuffle elements. No need for us to
4677 perform an AND ourselves. */
4678 if (one_operand_shuffle)
4679 {
4680 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4681 if (target != operands[0])
4682 emit_move_insn (operands[0],
4683 gen_lowpart (GET_MODE (operands[0]), target));
4684 }
4685 else
4686 {
4687 t1 = gen_reg_rtx (V8SImode);
4688 t2 = gen_reg_rtx (V8SImode);
4689 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4690 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4691 goto merge_two;
4692 }
4693 return;
4694
4695 case E_V8SFmode:
4696 mask = gen_lowpart (V8SImode, mask);
4697 if (one_operand_shuffle)
4698 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4699 else
4700 {
4701 t1 = gen_reg_rtx (V8SFmode);
4702 t2 = gen_reg_rtx (V8SFmode);
4703 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4704 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4705 goto merge_two;
4706 }
4707 return;
4708
4709 case E_V4SImode:
4710 /* By combining the two 128-bit input vectors into one 256-bit
4711 input vector, we can use VPERMD and VPERMPS for the full
4712 two-operand shuffle. */
4713 t1 = gen_reg_rtx (V8SImode);
4714 t2 = gen_reg_rtx (V8SImode);
4715 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4716 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4717 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4718 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4719 return;
4720
4721 case E_V4SFmode:
4722 t1 = gen_reg_rtx (V8SFmode);
4723 t2 = gen_reg_rtx (V8SImode);
4724 mask = gen_lowpart (V4SImode, mask);
4725 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4726 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4727 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4728 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4729 return;
4730
4731 case E_V32QImode:
4732 t1 = gen_reg_rtx (V32QImode);
4733 t2 = gen_reg_rtx (V32QImode);
4734 t3 = gen_reg_rtx (V32QImode);
4735 vt2 = GEN_INT (-128);
4736 vt = gen_const_vec_duplicate (V32QImode, vt2);
4737 vt = force_reg (V32QImode, vt);
4738 for (i = 0; i < 32; i++)
4739 vec[i] = i < 16 ? vt2 : const0_rtx;
4740 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4741 vt2 = force_reg (V32QImode, vt2);
4742 /* From mask create two adjusted masks, which contain the same
4743 bits as mask in the low 7 bits of each vector element.
4744 The first mask will have the most significant bit clear
4745 if it requests element from the same 128-bit lane
4746 and MSB set if it requests element from the other 128-bit lane.
4747 The second mask will have the opposite values of the MSB,
4748 and additionally will have its 128-bit lanes swapped.
4749 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4750 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4751 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4752 stands for other 12 bytes. */
4753 /* The bit that tells whether an element is from the same lane or the
4754 other lane is bit 4, so shift it up by 3 to the MSB position. */
4755 t5 = gen_reg_rtx (V4DImode);
4756 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4757 GEN_INT (3)));
4758 /* Clear MSB bits from the mask just in case it had them set. */
4759 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4760 /* After this t1 will have MSB set for elements from other lane. */
4761 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4762 /* Clear bits other than MSB. */
4763 emit_insn (gen_andv32qi3 (t1, t1, vt));
4764 /* Or in the lower bits from mask into t3. */
4765 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4766 /* And invert MSB bits in t1, so MSB is set for elements from the same
4767 lane. */
4768 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4769 /* Swap 128-bit lanes in t3. */
4770 t6 = gen_reg_rtx (V4DImode);
4771 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4772 const2_rtx, GEN_INT (3),
4773 const0_rtx, const1_rtx));
4774 /* And or in the lower bits from mask into t1. */
4775 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4776 if (one_operand_shuffle)
4777 {
4778 /* Each of these shuffles will put 0s in places where
4779 element from the other 128-bit lane is needed, otherwise
4780 will shuffle in the requested value. */
4781 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4782 gen_lowpart (V32QImode, t6)));
4783 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4784 /* For t3 the 128-bit lanes are swapped again. */
4785 t7 = gen_reg_rtx (V4DImode);
4786 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4787 const2_rtx, GEN_INT (3),
4788 const0_rtx, const1_rtx));
4789 /* And oring both together leads to the result. */
4790 emit_insn (gen_iorv32qi3 (target, t1,
4791 gen_lowpart (V32QImode, t7)));
4792 if (target != operands[0])
4793 emit_move_insn (operands[0],
4794 gen_lowpart (GET_MODE (operands[0]), target));
4795 return;
4796 }
4797
4798 t4 = gen_reg_rtx (V32QImode);
4799 /* Similar to the one_operand_shuffle code above, just repeated
4800 twice, once for each operand.  The code at the merge_two: label
4801 will merge the two results together. */
4802 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4803 gen_lowpart (V32QImode, t6)));
4804 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4805 gen_lowpart (V32QImode, t6)));
4806 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4807 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4808 t7 = gen_reg_rtx (V4DImode);
4809 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4810 const2_rtx, GEN_INT (3),
4811 const0_rtx, const1_rtx));
4812 t8 = gen_reg_rtx (V4DImode);
4813 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4814 const2_rtx, GEN_INT (3),
4815 const0_rtx, const1_rtx));
4816 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4817 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4818 t1 = t4;
4819 t2 = t3;
4820 goto merge_two;
4821
4822 default:
4823 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4824 break;
4825 }
4826 }
4827
4828 if (TARGET_XOP)
4829 {
4830 /* The XOP VPPERM insn supports three inputs. By ignoring the
4831 one_operand_shuffle special case, we avoid creating another
4832 set of constant vectors in memory. */
4833 one_operand_shuffle = false;
4834
4835 /* mask = mask & {2*w-1, ...} */
4836 vt = GEN_INT (2*w - 1);
4837 }
4838 else
4839 {
4840 /* mask = mask & {w-1, ...} */
4841 vt = GEN_INT (w - 1);
4842 }
4843
4844 vt = gen_const_vec_duplicate (maskmode, vt);
4845 mask = expand_simple_binop (maskmode, AND, mask, vt,
4846 NULL_RTX, 0, OPTAB_DIRECT);
4847
4848 /* For non-QImode operations, convert the word permutation control
4849 into a byte permutation control. */
4850 if (mode != V16QImode)
4851 {
4852 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4853 GEN_INT (exact_log2 (e)),
4854 NULL_RTX, 0, OPTAB_DIRECT);
4855
4856 /* Convert mask to vector of chars. */
4857 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4858
4859 /* Replicate each of the input bytes into byte positions:
4860 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4861 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4862 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4863 for (i = 0; i < 16; ++i)
4864 vec[i] = GEN_INT (i/e * e);
4865 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4866 vt = validize_mem (force_const_mem (V16QImode, vt));
4867 if (TARGET_XOP)
4868 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4869 else
4870 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4871
4872 /* Convert it into the byte positions by doing
4873 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4874 for (i = 0; i < 16; ++i)
4875 vec[i] = GEN_INT (i % e);
4876 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4877 vt = validize_mem (force_const_mem (V16QImode, vt));
4878 emit_insn (gen_addv16qi3 (mask, mask, vt));
4879 }
4880
4881 /* The actual shuffle operations all operate on V16QImode. */
4882 op0 = gen_lowpart (V16QImode, op0);
4883 op1 = gen_lowpart (V16QImode, op1);
4884
4885 if (TARGET_XOP)
4886 {
4887 if (GET_MODE (target) != V16QImode)
4888 target = gen_reg_rtx (V16QImode);
4889 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4890 if (target != operands[0])
4891 emit_move_insn (operands[0],
4892 gen_lowpart (GET_MODE (operands[0]), target));
4893 }
4894 else if (one_operand_shuffle)
4895 {
4896 if (GET_MODE (target) != V16QImode)
4897 target = gen_reg_rtx (V16QImode);
4898 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4899 if (target != operands[0])
4900 emit_move_insn (operands[0],
4901 gen_lowpart (GET_MODE (operands[0]), target));
4902 }
4903 else
4904 {
4905 rtx xops[6];
4906 bool ok;
4907
4908 /* Shuffle the two input vectors independently. */
4909 t1 = gen_reg_rtx (V16QImode);
4910 t2 = gen_reg_rtx (V16QImode);
4911 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4912 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4913
4914 merge_two:
4915 /* Then merge them together. The key is whether any given control
4916 element contained a bit set that indicates the second word. */
4917 mask = operands[3];
4918 vt = GEN_INT (w);
4919 if (maskmode == V2DImode && !TARGET_SSE4_1)
4920 {
4921 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4922 more shuffle to convert the V2DI input mask into a V4SI
4923 input mask.  At that point the masking done by expand_int_vcond
4924 will work as desired.  */
4925 rtx t3 = gen_reg_rtx (V4SImode);
4926 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4927 const0_rtx, const0_rtx,
4928 const2_rtx, const2_rtx));
4929 mask = t3;
4930 maskmode = V4SImode;
4931 e = w = 4;
4932 }
4933
4934 vt = gen_const_vec_duplicate (maskmode, vt);
4935 vt = force_reg (maskmode, vt);
4936 mask = expand_simple_binop (maskmode, AND, mask, vt,
4937 NULL_RTX, 0, OPTAB_DIRECT);
4938
4939 if (GET_MODE (target) != mode)
4940 target = gen_reg_rtx (mode);
4941 xops[0] = target;
4942 xops[1] = gen_lowpart (mode, t2);
4943 xops[2] = gen_lowpart (mode, t1);
4944 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4945 xops[4] = mask;
4946 xops[5] = vt;
4947 ok = ix86_expand_int_vcond (xops);
4948 gcc_assert (ok);
4949 if (target != operands[0])
4950 emit_move_insn (operands[0],
4951 gen_lowpart (GET_MODE (operands[0]), target));
4952 }
4953 }
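/* A rough scalar model of what the expansion above computes, whichever
   path is taken (illustrative sketch; W is the element count):

     for (i = 0; i < W; i++)
       {
         sel = mask[i] & (2 * W - 1);
         result[i] = sel < W ? op0[sel] : op1[sel - W];
       }

   The SSSE3/SSE4.1 fallback builds this from per-operand pshufb byte
   shuffles plus a final blend keyed on whether the selector referenced
   the second operand.  */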
4954
4955 /* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
4956 true if we should do zero extension, else sign extension. HIGH_P is
4957 true if we want the N/2 high elements, else the low elements. */
4958
4959 void
4960 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4961 {
4962 machine_mode imode = GET_MODE (src);
4963 rtx tmp;
4964
4965 if (TARGET_SSE4_1)
4966 {
4967 rtx (*unpack)(rtx, rtx);
4968 rtx (*extract)(rtx, rtx) = NULL;
4969 machine_mode halfmode = BLKmode;
4970
4971 switch (imode)
4972 {
4973 case E_V64QImode:
4974 if (unsigned_p)
4975 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4976 else
4977 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4978 halfmode = V32QImode;
4979 extract
4980 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4981 break;
4982 case E_V32QImode:
4983 if (unsigned_p)
4984 unpack = gen_avx2_zero_extendv16qiv16hi2;
4985 else
4986 unpack = gen_avx2_sign_extendv16qiv16hi2;
4987 halfmode = V16QImode;
4988 extract
4989 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4990 break;
4991 case E_V32HImode:
4992 if (unsigned_p)
4993 unpack = gen_avx512f_zero_extendv16hiv16si2;
4994 else
4995 unpack = gen_avx512f_sign_extendv16hiv16si2;
4996 halfmode = V16HImode;
4997 extract
4998 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4999 break;
5000 case E_V16HImode:
5001 if (unsigned_p)
5002 unpack = gen_avx2_zero_extendv8hiv8si2;
5003 else
5004 unpack = gen_avx2_sign_extendv8hiv8si2;
5005 halfmode = V8HImode;
5006 extract
5007 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5008 break;
5009 case E_V16SImode:
5010 if (unsigned_p)
5011 unpack = gen_avx512f_zero_extendv8siv8di2;
5012 else
5013 unpack = gen_avx512f_sign_extendv8siv8di2;
5014 halfmode = V8SImode;
5015 extract
5016 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5017 break;
5018 case E_V8SImode:
5019 if (unsigned_p)
5020 unpack = gen_avx2_zero_extendv4siv4di2;
5021 else
5022 unpack = gen_avx2_sign_extendv4siv4di2;
5023 halfmode = V4SImode;
5024 extract
5025 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5026 break;
5027 case E_V16QImode:
5028 if (unsigned_p)
5029 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5030 else
5031 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5032 break;
5033 case E_V8HImode:
5034 if (unsigned_p)
5035 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5036 else
5037 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5038 break;
5039 case E_V4SImode:
5040 if (unsigned_p)
5041 unpack = gen_sse4_1_zero_extendv2siv2di2;
5042 else
5043 unpack = gen_sse4_1_sign_extendv2siv2di2;
5044 break;
5045 default:
5046 gcc_unreachable ();
5047 }
5048
5049 if (GET_MODE_SIZE (imode) >= 32)
5050 {
5051 tmp = gen_reg_rtx (halfmode);
5052 emit_insn (extract (tmp, src));
5053 }
5054 else if (high_p)
5055 {
5056 /* Shift higher 8 bytes to lower 8 bytes. */
5057 tmp = gen_reg_rtx (V1TImode);
5058 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5059 GEN_INT (64)));
5060 tmp = gen_lowpart (imode, tmp);
5061 }
5062 else
5063 tmp = src;
5064
5065 emit_insn (unpack (dest, tmp));
5066 }
5067 else
5068 {
5069 rtx (*unpack)(rtx, rtx, rtx);
5070
5071 switch (imode)
5072 {
5073 case E_V16QImode:
5074 if (high_p)
5075 unpack = gen_vec_interleave_highv16qi;
5076 else
5077 unpack = gen_vec_interleave_lowv16qi;
5078 break;
5079 case E_V8HImode:
5080 if (high_p)
5081 unpack = gen_vec_interleave_highv8hi;
5082 else
5083 unpack = gen_vec_interleave_lowv8hi;
5084 break;
5085 case E_V4SImode:
5086 if (high_p)
5087 unpack = gen_vec_interleave_highv4si;
5088 else
5089 unpack = gen_vec_interleave_lowv4si;
5090 break;
5091 default:
5092 gcc_unreachable ();
5093 }
5094
5095 if (unsigned_p)
5096 tmp = force_reg (imode, CONST0_RTX (imode));
5097 else
5098 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5099 src, pc_rtx, pc_rtx);
5100
5101 rtx tmp2 = gen_reg_rtx (imode);
5102 emit_insn (unpack (tmp2, src, tmp));
5103 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5104 }
5105 }
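/* Sketch of the pre-SSE4.1 path above (illustrative): for signed input
   the helper first builds a per-element sign mask via the comparison
   0 > src (all-ones where the element is negative), then interleaves
   SRC with that mask, so e.g. for V8HImode each resulting 32-bit lane
   is { src[i], sign[i] }, i.e. the sign-extended element; with
   UNSIGNED_P the interleaved mask is simply zero.  */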
5106
5107 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5108 but works for floating-point parameters and non-offsettable memories.
5109 For pushes, it returns just stack offsets; the values will be saved
5110 in the right order.  At most four parts are generated. */
5111
5112 static int
5113 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5114 {
5115 int size;
5116
5117 if (!TARGET_64BIT)
5118 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5119 else
5120 size = (GET_MODE_SIZE (mode) + 4) / 8;
5121
5122 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5123 gcc_assert (size >= 2 && size <= 4);
5124
5125 /* Optimize constant pool reference to immediates. This is used by fp
5126 moves, that force all constants to memory to allow combining. */
5127 if (MEM_P (operand) && MEM_READONLY_P (operand))
5128 operand = avoid_constant_pool_reference (operand);
5129
5130 if (MEM_P (operand) && !offsettable_memref_p (operand))
5131 {
5132 /* The only non-offsettable memories we handle are pushes. */
5133 int ok = push_operand (operand, VOIDmode);
5134
5135 gcc_assert (ok);
5136
5137 operand = copy_rtx (operand);
5138 PUT_MODE (operand, word_mode);
5139 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5140 return size;
5141 }
5142
5143 if (GET_CODE (operand) == CONST_VECTOR)
5144 {
5145 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5146 /* Caution: if we looked through a constant pool memory above,
5147 the operand may actually have a different mode now. That's
5148 ok, since we want to pun this all the way back to an integer. */
5149 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5150 gcc_assert (operand != NULL);
5151 mode = imode;
5152 }
5153
5154 if (!TARGET_64BIT)
5155 {
5156 if (mode == DImode)
5157 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5158 else
5159 {
5160 int i;
5161
5162 if (REG_P (operand))
5163 {
5164 gcc_assert (reload_completed);
5165 for (i = 0; i < size; i++)
5166 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5167 }
5168 else if (offsettable_memref_p (operand))
5169 {
5170 operand = adjust_address (operand, SImode, 0);
5171 parts[0] = operand;
5172 for (i = 1; i < size; i++)
5173 parts[i] = adjust_address (operand, SImode, 4 * i);
5174 }
5175 else if (CONST_DOUBLE_P (operand))
5176 {
5177 const REAL_VALUE_TYPE *r;
5178 long l[4];
5179
5180 r = CONST_DOUBLE_REAL_VALUE (operand);
5181 switch (mode)
5182 {
5183 case E_TFmode:
5184 real_to_target (l, r, mode);
5185 parts[3] = gen_int_mode (l[3], SImode);
5186 parts[2] = gen_int_mode (l[2], SImode);
5187 break;
5188 case E_XFmode:
5189 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5190 long double may not be 80-bit. */
5191 real_to_target (l, r, mode);
5192 parts[2] = gen_int_mode (l[2], SImode);
5193 break;
5194 case E_DFmode:
5195 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5196 break;
5197 default:
5198 gcc_unreachable ();
5199 }
5200 parts[1] = gen_int_mode (l[1], SImode);
5201 parts[0] = gen_int_mode (l[0], SImode);
5202 }
5203 else
5204 gcc_unreachable ();
5205 }
5206 }
5207 else
5208 {
5209 if (mode == TImode)
5210 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5211 if (mode == XFmode || mode == TFmode)
5212 {
5213 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5214 if (REG_P (operand))
5215 {
5216 gcc_assert (reload_completed);
5217 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5218 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5219 }
5220 else if (offsettable_memref_p (operand))
5221 {
5222 operand = adjust_address (operand, DImode, 0);
5223 parts[0] = operand;
5224 parts[1] = adjust_address (operand, upper_mode, 8);
5225 }
5226 else if (CONST_DOUBLE_P (operand))
5227 {
5228 long l[4];
5229
5230 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5231
5232 /* real_to_target puts 32-bit pieces in each long. */
5233 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5234 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5235 << 32), DImode);
5236
5237 if (upper_mode == SImode)
5238 parts[1] = gen_int_mode (l[2], SImode);
5239 else
5240 parts[1]
5241 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5242 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5243 << 32), DImode);
5244 }
5245 else
5246 gcc_unreachable ();
5247 }
5248 }
5249
5250 return size;
5251 }
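/* Worked example of the CONST_DOUBLE case above (illustrative): on a
   32-bit target a DFmode constant such as 1.0 (IEEE-754 encoding
   0x3ff0000000000000) is split into

     parts[0] = 0x00000000   (low half of the significand)
     parts[1] = 0x3ff00000   (sign, exponent, high significand bits)

   and those two SImode immediates feed the moves emitted by
   ix86_split_long_move.  */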
5252
5253 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5254 This function emits all required insns itself.  Operands 2 onwards
5255 are filled with the destination parts and operands 6 onwards with
5256 the source parts, in the correct order. */
5257
5258 void
5259 ix86_split_long_move (rtx operands[])
5260 {
5261 rtx part[2][4];
5262 int nparts, i, j;
5263 int push = 0;
5264 int collisions = 0;
5265 machine_mode mode = GET_MODE (operands[0]);
5266 bool collisionparts[4];
5267
5268 /* The DFmode expanders may ask us to move a double.  For a 64-bit
5269 target this is a single move.  By hiding the fact here we simplify
5270 the i386.md splitters. */
5271 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5272 {
5273 /* Optimize constant pool reference to immediates. This is used by
5274 fp moves, that force all constants to memory to allow combining. */
5275
5276 if (MEM_P (operands[1])
5277 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5278 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5279 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5280 if (push_operand (operands[0], VOIDmode))
5281 {
5282 operands[0] = copy_rtx (operands[0]);
5283 PUT_MODE (operands[0], word_mode);
5284 }
5285 else
5286 operands[0] = gen_lowpart (DImode, operands[0]);
5287 operands[1] = gen_lowpart (DImode, operands[1]);
5288 emit_move_insn (operands[0], operands[1]);
5289 return;
5290 }
5291
5292 /* The only non-offsettable memory we handle is push. */
5293 if (push_operand (operands[0], VOIDmode))
5294 push = 1;
5295 else
5296 gcc_assert (!MEM_P (operands[0])
5297 || offsettable_memref_p (operands[0]));
5298
5299 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5300 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5301
5302 /* When emitting push, take care for source operands on the stack. */
5303 if (push && MEM_P (operands[1])
5304 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5305 {
5306 rtx src_base = XEXP (part[1][nparts - 1], 0);
5307
5308 /* Compensate for the stack decrement by 4. */
5309 if (!TARGET_64BIT && nparts == 3
5310 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5311 src_base = plus_constant (Pmode, src_base, 4);
5312
5313 /* src_base refers to the stack pointer and is
5314 automatically decreased by emitted push. */
5315 for (i = 0; i < nparts; i++)
5316 part[1][i] = change_address (part[1][i],
5317 GET_MODE (part[1][i]), src_base);
5318 }
5319
5320 /* We need to do the copy in the right order in case an address register
5321 of the source overlaps the destination. */
5322 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5323 {
5324 rtx tmp;
5325
5326 for (i = 0; i < nparts; i++)
5327 {
5328 collisionparts[i]
5329 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5330 if (collisionparts[i])
5331 collisions++;
5332 }
5333
5334 /* Collision in the middle part can be handled by reordering. */
5335 if (collisions == 1 && nparts == 3 && collisionparts [1])
5336 {
5337 std::swap (part[0][1], part[0][2]);
5338 std::swap (part[1][1], part[1][2]);
5339 }
5340 else if (collisions == 1
5341 && nparts == 4
5342 && (collisionparts [1] || collisionparts [2]))
5343 {
5344 if (collisionparts [1])
5345 {
5346 std::swap (part[0][1], part[0][2]);
5347 std::swap (part[1][1], part[1][2]);
5348 }
5349 else
5350 {
5351 std::swap (part[0][2], part[0][3]);
5352 std::swap (part[1][2], part[1][3]);
5353 }
5354 }
5355
5356 /* If there are more collisions, we can't handle it by reordering.
5357 Do an lea to the last part and use only one colliding move. */
5358 else if (collisions > 1)
5359 {
5360 rtx base, addr;
5361
5362 collisions = 1;
5363
5364 base = part[0][nparts - 1];
5365
5366 /* Handle the case when the last part isn't valid for lea.
5367 Happens in 64-bit mode storing the 12-byte XFmode. */
5368 if (GET_MODE (base) != Pmode)
5369 base = gen_rtx_REG (Pmode, REGNO (base));
5370
5371 addr = XEXP (part[1][0], 0);
5372 if (TARGET_TLS_DIRECT_SEG_REFS)
5373 {
5374 struct ix86_address parts;
5375 int ok = ix86_decompose_address (addr, &parts);
5376 gcc_assert (ok);
5377 /* It is not valid to use %gs: or %fs: in lea. */
5378 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5379 }
5380 emit_insn (gen_rtx_SET (base, addr));
5381 part[1][0] = replace_equiv_address (part[1][0], base);
5382 for (i = 1; i < nparts; i++)
5383 {
5384 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5385 part[1][i] = replace_equiv_address (part[1][i], tmp);
5386 }
5387 }
5388 }
5389
5390 if (push)
5391 {
5392 if (!TARGET_64BIT)
5393 {
5394 if (nparts == 3)
5395 {
5396 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5397 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5398 emit_move_insn (part[0][2], part[1][2]);
5399 }
5400 else if (nparts == 4)
5401 {
5402 emit_move_insn (part[0][3], part[1][3]);
5403 emit_move_insn (part[0][2], part[1][2]);
5404 }
5405 }
5406 else
5407 {
5408 /* In 64-bit mode we don't have a 32-bit push available.  If this is
5409 a register, that is OK - we will just use the larger counterpart.  We
5410 also retype memory - this comes from an attempt to avoid a REX prefix
5411 on moving the second half of a TFmode value. */
5412 if (GET_MODE (part[1][1]) == SImode)
5413 {
5414 switch (GET_CODE (part[1][1]))
5415 {
5416 case MEM:
5417 part[1][1] = adjust_address (part[1][1], DImode, 0);
5418 break;
5419
5420 case REG:
5421 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5422 break;
5423
5424 default:
5425 gcc_unreachable ();
5426 }
5427
5428 if (GET_MODE (part[1][0]) == SImode)
5429 part[1][0] = part[1][1];
5430 }
5431 }
5432 emit_move_insn (part[0][1], part[1][1]);
5433 emit_move_insn (part[0][0], part[1][0]);
5434 return;
5435 }
5436
5437 /* Choose the correct order so as not to overwrite the source before it is copied. */
5438 if ((REG_P (part[0][0])
5439 && REG_P (part[1][1])
5440 && (REGNO (part[0][0]) == REGNO (part[1][1])
5441 || (nparts == 3
5442 && REGNO (part[0][0]) == REGNO (part[1][2]))
5443 || (nparts == 4
5444 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5445 || (collisions > 0
5446 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5447 {
5448 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5449 {
5450 operands[2 + i] = part[0][j];
5451 operands[6 + i] = part[1][j];
5452 }
5453 }
5454 else
5455 {
5456 for (i = 0; i < nparts; i++)
5457 {
5458 operands[2 + i] = part[0][i];
5459 operands[6 + i] = part[1][i];
5460 }
5461 }
5462
5463 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5464 if (optimize_insn_for_size_p ())
5465 {
5466 for (j = 0; j < nparts - 1; j++)
5467 if (CONST_INT_P (operands[6 + j])
5468 && operands[6 + j] != const0_rtx
5469 && REG_P (operands[2 + j]))
5470 for (i = j; i < nparts - 1; i++)
5471 if (CONST_INT_P (operands[7 + i])
5472 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5473 operands[7 + i] = operands[2 + j];
5474 }
5475
5476 for (i = 0; i < nparts; i++)
5477 emit_move_insn (operands[2 + i], operands[6 + i]);
5478
5479 return;
5480 }
5481
5482 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5483 left shift by a constant, either using a single shift or
5484 a sequence of add instructions. */
5485
5486 static void
5487 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5488 {
5489 if (count == 1
5490 || (count * ix86_cost->add <= ix86_cost->shift_const
5491 && !optimize_insn_for_size_p ()))
5492 {
5493 while (count-- > 0)
5494 emit_insn (gen_add2_insn (operand, operand));
5495 }
5496 else
5497 {
5498 rtx (*insn)(rtx, rtx, rtx);
5499
5500 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5501 emit_insn (insn (operand, operand, GEN_INT (count)));
5502 }
5503 }
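/* Sketch of the trade-off above (illustrative): a left shift by COUNT
   is the same as doubling COUNT times, e.g.

     x << 3   ==>   x += x;  x += x;  x += x;

   which is used when COUNT additions are no more expensive than one
   shift by a constant (and always for COUNT == 1).  */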
5504
5505 void
5506 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5507 {
5508 rtx (*gen_ashl3)(rtx, rtx, rtx);
5509 rtx (*gen_shld)(rtx, rtx, rtx);
5510 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5511 machine_mode half_mode;
5512
5513 rtx low[2], high[2];
5514 int count;
5515
5516 if (CONST_INT_P (operands[2]))
5517 {
5518 split_double_mode (mode, operands, 2, low, high);
5519 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5520
5521 if (count >= half_width)
5522 {
5523 emit_move_insn (high[0], low[1]);
5524 emit_move_insn (low[0], const0_rtx);
5525
5526 if (count > half_width)
5527 ix86_expand_ashl_const (high[0], count - half_width, mode);
5528 }
5529 else
5530 {
5531 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5532
5533 if (!rtx_equal_p (operands[0], operands[1]))
5534 emit_move_insn (operands[0], operands[1]);
5535
5536 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5537 ix86_expand_ashl_const (low[0], count, mode);
5538 }
5539 return;
5540 }
5541
5542 split_double_mode (mode, operands, 1, low, high);
5543 half_mode = mode == DImode ? SImode : DImode;
5544
5545 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5546
5547 if (operands[1] == const1_rtx)
5548 {
5549 /* Assuming we've chosen QImode-capable registers, 1 << N
5550 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5551 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5552 {
5553 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5554
5555 ix86_expand_clear (low[0]);
5556 ix86_expand_clear (high[0]);
5557 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5558
5559 d = gen_lowpart (QImode, low[0]);
5560 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5561 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5562 emit_insn (gen_rtx_SET (d, s));
5563
5564 d = gen_lowpart (QImode, high[0]);
5565 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5566 s = gen_rtx_NE (QImode, flags, const0_rtx);
5567 emit_insn (gen_rtx_SET (d, s));
5568 }
5569
5570 /* Otherwise, we can get the same results by manually performing
5571 a bit extract operation on bit 5/6, and then performing the two
5572 shifts. The two methods of getting 0/1 into low/high are exactly
5573 the same size. Avoiding the shift in the bit extract case helps
5574 pentium4 a bit; no one else seems to care much either way. */
5575 else
5576 {
5577 rtx (*gen_lshr3)(rtx, rtx, rtx);
5578 rtx (*gen_and3)(rtx, rtx, rtx);
5579 rtx (*gen_xor3)(rtx, rtx, rtx);
5580 HOST_WIDE_INT bits;
5581 rtx x;
5582
5583 if (mode == DImode)
5584 {
5585 gen_lshr3 = gen_lshrsi3;
5586 gen_and3 = gen_andsi3;
5587 gen_xor3 = gen_xorsi3;
5588 bits = 5;
5589 }
5590 else
5591 {
5592 gen_lshr3 = gen_lshrdi3;
5593 gen_and3 = gen_anddi3;
5594 gen_xor3 = gen_xordi3;
5595 bits = 6;
5596 }
5597
5598 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5599 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5600 else
5601 x = gen_lowpart (half_mode, operands[2]);
5602 emit_insn (gen_rtx_SET (high[0], x));
5603
5604 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5605 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5606 emit_move_insn (low[0], high[0]);
5607 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5608 }
5609
5610 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5611 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5612 return;
5613 }
5614
5615 if (operands[1] == constm1_rtx)
5616 {
5617 /* For -1 << N, we can avoid the shld instruction, because we
5618 know that we're shifting 0...31/63 ones into a -1. */
5619 emit_move_insn (low[0], constm1_rtx);
5620 if (optimize_insn_for_size_p ())
5621 emit_move_insn (high[0], low[0]);
5622 else
5623 emit_move_insn (high[0], constm1_rtx);
5624 }
5625 else
5626 {
5627 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5628
5629 if (!rtx_equal_p (operands[0], operands[1]))
5630 emit_move_insn (operands[0], operands[1]);
5631
5632 split_double_mode (mode, operands, 1, low, high);
5633 emit_insn (gen_shld (high[0], low[0], operands[2]));
5634 }
5635
5636 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5637
5638 if (TARGET_CMOVE && scratch)
5639 {
5640 ix86_expand_clear (scratch);
5641 emit_insn (gen_x86_shift_adj_1
5642 (half_mode, high[0], low[0], operands[2], scratch));
5643 }
5644 else
5645 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5646 }
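/* Worked example of the constant-count case above (illustrative; a
   64-bit value held in 32-bit LOW/HIGH halves):

     x << 40:  count >= 32, so  high = low << 8;  low = 0;
     x << 10:  count <  32, so  high = shld (high, low, 10);  low <<= 10;

   The variable-count path emits both shifts unconditionally and relies
   on x86_shift_adj_1/x86_shift_adj_2 to fix up the halves when the
   runtime count turns out to be >= 32.  */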
5647
5648 void
5649 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5650 {
5651 rtx (*gen_ashr3)(rtx, rtx, rtx)
5652 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5653 rtx (*gen_shrd)(rtx, rtx, rtx);
5654 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5655
5656 rtx low[2], high[2];
5657 int count;
5658
5659 if (CONST_INT_P (operands[2]))
5660 {
5661 split_double_mode (mode, operands, 2, low, high);
5662 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5663
5664 if (count == GET_MODE_BITSIZE (mode) - 1)
5665 {
5666 emit_move_insn (high[0], high[1]);
5667 emit_insn (gen_ashr3 (high[0], high[0],
5668 GEN_INT (half_width - 1)));
5669 emit_move_insn (low[0], high[0]);
5670
5671 }
5672 else if (count >= half_width)
5673 {
5674 emit_move_insn (low[0], high[1]);
5675 emit_move_insn (high[0], low[0]);
5676 emit_insn (gen_ashr3 (high[0], high[0],
5677 GEN_INT (half_width - 1)));
5678
5679 if (count > half_width)
5680 emit_insn (gen_ashr3 (low[0], low[0],
5681 GEN_INT (count - half_width)));
5682 }
5683 else
5684 {
5685 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5686
5687 if (!rtx_equal_p (operands[0], operands[1]))
5688 emit_move_insn (operands[0], operands[1]);
5689
5690 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5691 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5692 }
5693 }
5694 else
5695 {
5696 machine_mode half_mode;
5697
5698 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5699
5700 if (!rtx_equal_p (operands[0], operands[1]))
5701 emit_move_insn (operands[0], operands[1]);
5702
5703 split_double_mode (mode, operands, 1, low, high);
5704 half_mode = mode == DImode ? SImode : DImode;
5705
5706 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5707 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5708
5709 if (TARGET_CMOVE && scratch)
5710 {
5711 emit_move_insn (scratch, high[0]);
5712 emit_insn (gen_ashr3 (scratch, scratch,
5713 GEN_INT (half_width - 1)));
5714 emit_insn (gen_x86_shift_adj_1
5715 (half_mode, low[0], high[0], operands[2], scratch));
5716 }
5717 else
5718 emit_insn (gen_x86_shift_adj_3
5719 (half_mode, low[0], high[0], operands[2]));
5720 }
5721 }
5722
5723 void
5724 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5725 {
5726 rtx (*gen_lshr3)(rtx, rtx, rtx)
5727 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5728 rtx (*gen_shrd)(rtx, rtx, rtx);
5729 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5730
5731 rtx low[2], high[2];
5732 int count;
5733
5734 if (CONST_INT_P (operands[2]))
5735 {
5736 split_double_mode (mode, operands, 2, low, high);
5737 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5738
5739 if (count >= half_width)
5740 {
5741 emit_move_insn (low[0], high[1]);
5742 ix86_expand_clear (high[0]);
5743
5744 if (count > half_width)
5745 emit_insn (gen_lshr3 (low[0], low[0],
5746 GEN_INT (count - half_width)));
5747 }
5748 else
5749 {
5750 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5751
5752 if (!rtx_equal_p (operands[0], operands[1]))
5753 emit_move_insn (operands[0], operands[1]);
5754
5755 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5756 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5757 }
5758 }
5759 else
5760 {
5761 machine_mode half_mode;
5762
5763 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5764
5765 if (!rtx_equal_p (operands[0], operands[1]))
5766 emit_move_insn (operands[0], operands[1]);
5767
5768 split_double_mode (mode, operands, 1, low, high);
5769 half_mode = mode == DImode ? SImode : DImode;
5770
5771 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5772 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5773
5774 if (TARGET_CMOVE && scratch)
5775 {
5776 ix86_expand_clear (scratch);
5777 emit_insn (gen_x86_shift_adj_1
5778 (half_mode, low[0], high[0], operands[2], scratch));
5779 }
5780 else
5781 emit_insn (gen_x86_shift_adj_2
5782 (half_mode, low[0], high[0], operands[2]));
5783 }
5784 }
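/* Worked example for the logical right shift above (illustrative; a
   64-bit value held in 32-bit LOW/HIGH halves):

     x >> 40:  count >= 32, so  low = high >> 8;  high = 0;
     x >> 10:  count <  32, so  low = shrd (low, high, 10);  high >>= 10;

   The arithmetic variant in ix86_split_ashr differs only in filling the
   upper half with copies of the sign bit (high >> 31) instead of zero.  */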
5785
5786 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5787 DImode for constant loop counts. */
5788
5789 static machine_mode
5790 counter_mode (rtx count_exp)
5791 {
5792 if (GET_MODE (count_exp) != VOIDmode)
5793 return GET_MODE (count_exp);
5794 if (!CONST_INT_P (count_exp))
5795 return Pmode;
5796 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5797 return DImode;
5798 return SImode;
5799 }
5800
5801 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed
5802 to by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the
5803 overall size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output
5804 the equivalent loop to set the memory to VALUE (supposed to be in MODE).
5805 
5806 The size is rounded down to a whole number of chunks moved at once.
5807 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
5808
5809
5810 static void
5811 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5812 rtx destptr, rtx srcptr, rtx value,
5813 rtx count, machine_mode mode, int unroll,
5814 int expected_size, bool issetmem)
5815 {
5816 rtx_code_label *out_label, *top_label;
5817 rtx iter, tmp;
5818 machine_mode iter_mode = counter_mode (count);
5819 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5820 rtx piece_size = GEN_INT (piece_size_n);
5821 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5822 rtx size;
5823 int i;
5824
5825 top_label = gen_label_rtx ();
5826 out_label = gen_label_rtx ();
5827 iter = gen_reg_rtx (iter_mode);
5828
5829 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5830 NULL, 1, OPTAB_DIRECT);
5831 /* Those two should combine. */
5832 if (piece_size == const1_rtx)
5833 {
5834 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5835 true, out_label);
5836 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5837 }
5838 emit_move_insn (iter, const0_rtx);
5839
5840 emit_label (top_label);
5841
5842 tmp = convert_modes (Pmode, iter_mode, iter, true);
5843
5844 /* This assert could be relaxed - in that case we'll need to compute
5845 the smallest power of two containing PIECE_SIZE_N and pass it to
5846 offset_address. */
5847 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5848 destmem = offset_address (destmem, tmp, piece_size_n);
5849 destmem = adjust_address (destmem, mode, 0);
5850
5851 if (!issetmem)
5852 {
5853 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5854 srcmem = adjust_address (srcmem, mode, 0);
5855
5856 /* When unrolling for chips that reorder memory reads and writes,
5857 we can save registers by using a single temporary.
5858 Also, using 4 temporaries is overkill in 32-bit mode. */
5859 if (!TARGET_64BIT && 0)
5860 {
5861 for (i = 0; i < unroll; i++)
5862 {
5863 if (i)
5864 {
5865 destmem = adjust_address (copy_rtx (destmem), mode,
5866 GET_MODE_SIZE (mode));
5867 srcmem = adjust_address (copy_rtx (srcmem), mode,
5868 GET_MODE_SIZE (mode));
5869 }
5870 emit_move_insn (destmem, srcmem);
5871 }
5872 }
5873 else
5874 {
5875 rtx tmpreg[4];
5876 gcc_assert (unroll <= 4);
5877 for (i = 0; i < unroll; i++)
5878 {
5879 tmpreg[i] = gen_reg_rtx (mode);
5880 if (i)
5881 srcmem = adjust_address (copy_rtx (srcmem), mode,
5882 GET_MODE_SIZE (mode));
5883 emit_move_insn (tmpreg[i], srcmem);
5884 }
5885 for (i = 0; i < unroll; i++)
5886 {
5887 if (i)
5888 destmem = adjust_address (copy_rtx (destmem), mode,
5889 GET_MODE_SIZE (mode));
5890 emit_move_insn (destmem, tmpreg[i]);
5891 }
5892 }
5893 }
5894 else
5895 for (i = 0; i < unroll; i++)
5896 {
5897 if (i)
5898 destmem = adjust_address (copy_rtx (destmem), mode,
5899 GET_MODE_SIZE (mode));
5900 emit_move_insn (destmem, value);
5901 }
5902
5903 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5904 true, OPTAB_LIB_WIDEN);
5905 if (tmp != iter)
5906 emit_move_insn (iter, tmp);
5907
5908 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5909 true, top_label);
5910 if (expected_size != -1)
5911 {
5912 expected_size /= GET_MODE_SIZE (mode) * unroll;
5913 if (expected_size == 0)
5914 predict_jump (0);
5915 else if (expected_size > REG_BR_PROB_BASE)
5916 predict_jump (REG_BR_PROB_BASE - 1);
5917 else
5918 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5919 / expected_size);
5920 }
5921 else
5922 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5923 iter = ix86_zero_extend_to_Pmode (iter);
5924 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5925 true, OPTAB_LIB_WIDEN);
5926 if (tmp != destptr)
5927 emit_move_insn (destptr, tmp);
5928 if (!issetmem)
5929 {
5930 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5931 true, OPTAB_LIB_WIDEN);
5932 if (tmp != srcptr)
5933 emit_move_insn (srcptr, tmp);
5934 }
5935 emit_label (out_label);
5936 }
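/* Rough C model of the loop emitted above (illustrative sketch only;
   PIECE is GET_MODE_SIZE (MODE)):

     size = count & ~(PIECE * UNROLL - 1);
     for (iter = 0; iter < size; iter += PIECE * UNROLL)
       copy (or store VALUE into) UNROLL chunks of PIECE bytes
         at dest + iter (and src + iter);
     dest += size;  src += size;

   Any remainder below one unrolled chunk is left for the caller.  */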
5937
5938 /* Divide COUNTREG by SCALE. */
5939 static rtx
5940 scale_counter (rtx countreg, int scale)
5941 {
5942 rtx sc;
5943
5944 if (scale == 1)
5945 return countreg;
5946 if (CONST_INT_P (countreg))
5947 return GEN_INT (INTVAL (countreg) / scale);
5948 gcc_assert (REG_P (countreg));
5949
5950 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5951 GEN_INT (exact_log2 (scale)),
5952 NULL, 1, OPTAB_DIRECT);
5953 return sc;
5954 }
5955
5956 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5957 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5958 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5959 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5960 ORIG_VALUE is the original value passed to memset to fill the memory with.
5961 Other arguments have the same meaning as for the previous function. */
5962
5963 static void
5964 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5965 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5966 rtx count,
5967 machine_mode mode, bool issetmem)
5968 {
5969 rtx destexp;
5970 rtx srcexp;
5971 rtx countreg;
5972 HOST_WIDE_INT rounded_count;
5973
5974 /* If possible, it is shorter to use rep movs.
5975 TODO: Maybe it is better to move this logic to decide_alg. */
5976 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5977 && (!issetmem || orig_value == const0_rtx))
5978 mode = SImode;
5979
5980 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5981 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5982
5983 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5984 GET_MODE_SIZE (mode)));
5985 if (mode != QImode)
5986 {
5987 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5988 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5989 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5990 }
5991 else
5992 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5993 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5994 {
5995 rounded_count
5996 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5997 destmem = shallow_copy_rtx (destmem);
5998 set_mem_size (destmem, rounded_count);
5999 }
6000 else if (MEM_SIZE_KNOWN_P (destmem))
6001 clear_mem_size (destmem);
6002
6003 if (issetmem)
6004 {
6005 value = force_reg (mode, gen_lowpart (mode, value));
6006 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
6007 }
6008 else
6009 {
6010 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
6011 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6012 if (mode != QImode)
6013 {
6014 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6015 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6016 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6017 }
6018 else
6019 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6020 if (CONST_INT_P (count))
6021 {
6022 rounded_count
6023 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6024 srcmem = shallow_copy_rtx (srcmem);
6025 set_mem_size (srcmem, rounded_count);
6026 }
6027 else
6028 {
6029 if (MEM_SIZE_KNOWN_P (srcmem))
6030 clear_mem_size (srcmem);
6031 }
6032 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6033 destexp, srcexp));
6034 }
6035 }
6036
6037 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6038 DESTMEM.
6039 SRCMEM is passed by pointer so it can be updated on return.
6040 Return value is the updated DST. */
6041 static rtx
6042 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6043 HOST_WIDE_INT size_to_move)
6044 {
6045 rtx dst = destmem, src = *srcmem, adjust, tempreg;
6046 enum insn_code code;
6047 machine_mode move_mode;
6048 int piece_size, i;
6049
6050 /* Find the widest mode in which we could perform moves.
6051 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6052 it until a move of such size is supported. */
6053 piece_size = 1 << floor_log2 (size_to_move);
6054 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6055 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6056 {
6057 gcc_assert (piece_size > 1);
6058 piece_size >>= 1;
6059 }
6060
6061 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6062 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6063 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6064 {
6065 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6066 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6067 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6068 {
6069 move_mode = word_mode;
6070 piece_size = GET_MODE_SIZE (move_mode);
6071 code = optab_handler (mov_optab, move_mode);
6072 }
6073 }
6074 gcc_assert (code != CODE_FOR_nothing);
6075
6076 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6077 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6078
6079 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6080 gcc_assert (size_to_move % piece_size == 0);
6081 adjust = GEN_INT (piece_size);
6082 for (i = 0; i < size_to_move; i += piece_size)
6083 {
6084 /* We move from memory to memory, so we'll need to do it via
6085 a temporary register. */
6086 tempreg = gen_reg_rtx (move_mode);
6087 emit_insn (GEN_FCN (code) (tempreg, src));
6088 emit_insn (GEN_FCN (code) (dst, tempreg));
6089
6090 emit_move_insn (destptr,
6091 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6092 emit_move_insn (srcptr,
6093 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
6094
6095 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6096 piece_size);
6097 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6098 piece_size);
6099 }
6100
6101 /* Update DST and SRC rtx. */
6102 *srcmem = src;
6103 return dst;
6104 }
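/* Illustrative example (assuming a 64-bit target where only integer moves are
   available): a hypothetical call emit_memmov (dst, &src, destptr, srcptr, 8)
   selects DImode as MOVE_MODE, so the loop above runs once and emits roughly

     tmp = *(uint64_t *) srcptr;
     *(uint64_t *) destptr = tmp;
     destptr += 8;
     srcptr += 8;

   with DST and SRC readjusted so that follow-up moves keep correct aliasing
   information.  */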
6105
6106 /* Helper function for the string operations below. Test VARIABLE for whether
6107 it is aligned to VALUE bytes. If true, jump to the returned label. */
6108
6109 static rtx_code_label *
6110 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6111 {
6112 rtx_code_label *label = gen_label_rtx ();
6113 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6114 if (GET_MODE (variable) == DImode)
6115 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6116 else
6117 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6118 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6119 1, label);
6120 if (epilogue)
6121 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6122 else
6123 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6124 return label;
6125 }
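/* For illustration, a hypothetical call ix86_expand_aligntest (count, 4, true)
   emits the equivalent of

     if ((count & 4) == 0)
       goto label;        predicted 50% taken in epilogues, 90% otherwise

   and returns LABEL so the caller can place the conditional 4-byte chunk
   before the label.  */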
6126
6127
6128 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6129
6130 static void
6131 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6132 rtx destptr, rtx srcptr, rtx count, int max_size)
6133 {
6134 rtx src, dest;
6135 if (CONST_INT_P (count))
6136 {
6137 HOST_WIDE_INT countval = INTVAL (count);
6138 HOST_WIDE_INT epilogue_size = countval % max_size;
6139 int i;
6140
6141 /* For now MAX_SIZE should be a power of 2. This assert could be
6142 relaxed, but it'll require a bit more complicated epilogue
6143 expanding. */
6144 gcc_assert ((max_size & (max_size - 1)) == 0);
6145 for (i = max_size; i >= 1; i >>= 1)
6146 {
6147 if (epilogue_size & i)
6148 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6149 }
6150 return;
6151 }
6152 if (max_size > 8)
6153 {
6154 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6155 count, 1, OPTAB_DIRECT);
6156 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6157 count, QImode, 1, 4, false);
6158 return;
6159 }
6160
6161 /* When single stringop instructions are available, we can cheaply advance the
6162 dest and src pointers. Otherwise we save code size by maintaining an offset
6163 (zero is readily available from the preceding rep operation) and using x86 addressing modes.
6164 */
6165 if (TARGET_SINGLE_STRINGOP)
6166 {
6167 if (max_size > 4)
6168 {
6169 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6170 src = change_address (srcmem, SImode, srcptr);
6171 dest = change_address (destmem, SImode, destptr);
6172 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6173 emit_label (label);
6174 LABEL_NUSES (label) = 1;
6175 }
6176 if (max_size > 2)
6177 {
6178 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6179 src = change_address (srcmem, HImode, srcptr);
6180 dest = change_address (destmem, HImode, destptr);
6181 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6182 emit_label (label);
6183 LABEL_NUSES (label) = 1;
6184 }
6185 if (max_size > 1)
6186 {
6187 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6188 src = change_address (srcmem, QImode, srcptr);
6189 dest = change_address (destmem, QImode, destptr);
6190 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6191 emit_label (label);
6192 LABEL_NUSES (label) = 1;
6193 }
6194 }
6195 else
6196 {
6197 rtx offset = force_reg (Pmode, const0_rtx);
6198 rtx tmp;
6199
6200 if (max_size > 4)
6201 {
6202 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6203 src = change_address (srcmem, SImode, srcptr);
6204 dest = change_address (destmem, SImode, destptr);
6205 emit_move_insn (dest, src);
6206 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6207 true, OPTAB_LIB_WIDEN);
6208 if (tmp != offset)
6209 emit_move_insn (offset, tmp);
6210 emit_label (label);
6211 LABEL_NUSES (label) = 1;
6212 }
6213 if (max_size > 2)
6214 {
6215 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6216 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6217 src = change_address (srcmem, HImode, tmp);
6218 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6219 dest = change_address (destmem, HImode, tmp);
6220 emit_move_insn (dest, src);
6221 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6222 true, OPTAB_LIB_WIDEN);
6223 if (tmp != offset)
6224 emit_move_insn (offset, tmp);
6225 emit_label (label);
6226 LABEL_NUSES (label) = 1;
6227 }
6228 if (max_size > 1)
6229 {
6230 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6231 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6232 src = change_address (srcmem, QImode, tmp);
6233 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6234 dest = change_address (destmem, QImode, tmp);
6235 emit_move_insn (dest, src);
6236 emit_label (label);
6237 LABEL_NUSES (label) = 1;
6238 }
6239 }
6240 }
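/* Worked example (for illustration only): with a constant COUNT of 13 and
   MAX_SIZE of 16, the loop over powers of two above calls emit_memmov for the
   8, 4 and 1 byte pieces in turn, since 13 % 16 == 13 == 8 + 4 + 1; no byte
   loop or stringop fallback is needed in that case.  */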
6241
6242 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6243 with value PROMOTED_VAL.
6244 DESTPTR is the register holding the destination address.
6245 Return value is the updated DST. */
6246 static rtx
6247 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6248 HOST_WIDE_INT size_to_move)
6249 {
6250 rtx dst = destmem, adjust;
6251 enum insn_code code;
6252 machine_mode move_mode;
6253 int piece_size, i;
6254
6255 /* Find the widest mode in which we could perform moves.
6256 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6257 it until a move of such size is supported. */
6258 move_mode = GET_MODE (promoted_val);
6259 if (move_mode == VOIDmode)
6260 move_mode = QImode;
6261 if (size_to_move < GET_MODE_SIZE (move_mode))
6262 {
6263 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6264 move_mode = int_mode_for_size (move_bits, 0).require ();
6265 promoted_val = gen_lowpart (move_mode, promoted_val);
6266 }
6267 piece_size = GET_MODE_SIZE (move_mode);
6268 code = optab_handler (mov_optab, move_mode);
6269 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6270
6271 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6272
6273 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6274 gcc_assert (size_to_move % piece_size == 0);
6275 adjust = GEN_INT (piece_size);
6276 for (i = 0; i < size_to_move; i += piece_size)
6277 {
6278 if (piece_size <= GET_MODE_SIZE (word_mode))
6279 {
6280 emit_insn (gen_strset (destptr, dst, promoted_val));
6281 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6282 piece_size);
6283 continue;
6284 }
6285
6286 emit_insn (GEN_FCN (code) (dst, promoted_val));
6287
6288 emit_move_insn (destptr,
6289 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6290
6291 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6292 piece_size);
6293 }
6294
6295 /* Update DST rtx. */
6296 return dst;
6297 }
6298 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
6299 static void
6300 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6301 rtx count, int max_size)
6302 {
6303 count = expand_simple_binop (counter_mode (count), AND, count,
6304 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6305 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6306 gen_lowpart (QImode, value), count, QImode,
6307 1, max_size / 2, true);
6308 }
6309
6310 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
6311 static void
6312 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6313 rtx count, int max_size)
6314 {
6315 rtx dest;
6316
6317 if (CONST_INT_P (count))
6318 {
6319 HOST_WIDE_INT countval = INTVAL (count);
6320 HOST_WIDE_INT epilogue_size = countval % max_size;
6321 int i;
6322
6323 /* For now MAX_SIZE should be a power of 2. This assert could be
6324 relaxed, but it'll require a bit more complicated epilogue
6325 expanding. */
6326 gcc_assert ((max_size & (max_size - 1)) == 0);
6327 for (i = max_size; i >= 1; i >>= 1)
6328 {
6329 if (epilogue_size & i)
6330 {
6331 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6332 destmem = emit_memset (destmem, destptr, vec_value, i);
6333 else
6334 destmem = emit_memset (destmem, destptr, value, i);
6335 }
6336 }
6337 return;
6338 }
6339 if (max_size > 32)
6340 {
6341 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6342 return;
6343 }
6344 if (max_size > 16)
6345 {
6346 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6347 if (TARGET_64BIT)
6348 {
6349 dest = change_address (destmem, DImode, destptr);
6350 emit_insn (gen_strset (destptr, dest, value));
6351 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6352 emit_insn (gen_strset (destptr, dest, value));
6353 }
6354 else
6355 {
6356 dest = change_address (destmem, SImode, destptr);
6357 emit_insn (gen_strset (destptr, dest, value));
6358 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6359 emit_insn (gen_strset (destptr, dest, value));
6360 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6361 emit_insn (gen_strset (destptr, dest, value));
6362 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6363 emit_insn (gen_strset (destptr, dest, value));
6364 }
6365 emit_label (label);
6366 LABEL_NUSES (label) = 1;
6367 }
6368 if (max_size > 8)
6369 {
6370 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6371 if (TARGET_64BIT)
6372 {
6373 dest = change_address (destmem, DImode, destptr);
6374 emit_insn (gen_strset (destptr, dest, value));
6375 }
6376 else
6377 {
6378 dest = change_address (destmem, SImode, destptr);
6379 emit_insn (gen_strset (destptr, dest, value));
6380 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6381 emit_insn (gen_strset (destptr, dest, value));
6382 }
6383 emit_label (label);
6384 LABEL_NUSES (label) = 1;
6385 }
6386 if (max_size > 4)
6387 {
6388 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6389 dest = change_address (destmem, SImode, destptr);
6390 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6391 emit_label (label);
6392 LABEL_NUSES (label) = 1;
6393 }
6394 if (max_size > 2)
6395 {
6396 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6397 dest = change_address (destmem, HImode, destptr);
6398 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6399 emit_label (label);
6400 LABEL_NUSES (label) = 1;
6401 }
6402 if (max_size > 1)
6403 {
6404 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6405 dest = change_address (destmem, QImode, destptr);
6406 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6407 emit_label (label);
6408 LABEL_NUSES (label) = 1;
6409 }
6410 }
6411
6412 /* Adjust COUNTER by the VALUE. */
6413 static void
6414 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6415 {
6416 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6417 }
6418
6419 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM or store enough
6420 bytes into DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6421 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6422 ignored.
6423 Return value is updated DESTMEM. */
6424
6425 static rtx
6426 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6427 rtx destptr, rtx srcptr, rtx value,
6428 rtx vec_value, rtx count, int align,
6429 int desired_alignment, bool issetmem)
6430 {
6431 int i;
6432 for (i = 1; i < desired_alignment; i <<= 1)
6433 {
6434 if (align <= i)
6435 {
6436 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6437 if (issetmem)
6438 {
6439 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6440 destmem = emit_memset (destmem, destptr, vec_value, i);
6441 else
6442 destmem = emit_memset (destmem, destptr, value, i);
6443 }
6444 else
6445 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6446 ix86_adjust_counter (count, i);
6447 emit_label (label);
6448 LABEL_NUSES (label) = 1;
6449 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6450 }
6451 }
6452 return destmem;
6453 }
6454
6455 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
6456 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6457 and jump to DONE_LABEL. */
6458 static void
6459 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6460 rtx destptr, rtx srcptr,
6461 rtx value, rtx vec_value,
6462 rtx count, int size,
6463 rtx done_label, bool issetmem)
6464 {
6465 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6466 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6467 rtx modesize;
6468 int n;
6469
6470 /* If we do not have vector value to copy, we must reduce size. */
6471 if (issetmem)
6472 {
6473 if (!vec_value)
6474 {
6475 if (GET_MODE (value) == VOIDmode && size > 8)
6476 mode = Pmode;
6477 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6478 mode = GET_MODE (value);
6479 }
6480 else
6481 mode = GET_MODE (vec_value), value = vec_value;
6482 }
6483 else
6484 {
6485 /* Choose appropriate vector mode. */
6486 if (size >= 32)
6487 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6488 else if (size >= 16)
6489 mode = TARGET_SSE ? V16QImode : DImode;
6490 srcmem = change_address (srcmem, mode, srcptr);
6491 }
6492 destmem = change_address (destmem, mode, destptr);
6493 modesize = GEN_INT (GET_MODE_SIZE (mode));
6494 gcc_assert (GET_MODE_SIZE (mode) <= size);
6495 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6496 {
6497 if (issetmem)
6498 emit_move_insn (destmem, gen_lowpart (mode, value));
6499 else
6500 {
6501 emit_move_insn (destmem, srcmem);
6502 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6503 }
6504 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6505 }
6506
6507 destmem = offset_address (destmem, count, 1);
6508 destmem = offset_address (destmem, GEN_INT (-2 * size),
6509 GET_MODE_SIZE (mode));
6510 if (!issetmem)
6511 {
6512 srcmem = offset_address (srcmem, count, 1);
6513 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6514 GET_MODE_SIZE (mode));
6515 }
6516 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6517 {
6518 if (issetmem)
6519 emit_move_insn (destmem, gen_lowpart (mode, value));
6520 else
6521 {
6522 emit_move_insn (destmem, srcmem);
6523 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6524 }
6525 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6526 }
6527 emit_jump_insn (gen_jump (done_label));
6528 emit_barrier ();
6529
6530 emit_label (label);
6531 LABEL_NUSES (label) = 1;
6532 }
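/* To illustrate the overlap trick above (a sketch, not literal output): for
   SIZE == 4 and a runtime COUNT of 6, the first loop copies bytes [0, 4) of
   the block and the second loop copies bytes [COUNT - 4, COUNT) == [2, 6),
   so every byte of a 4..7 byte block is covered even though the two moves
   may overlap.  */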
6533
6534 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
6535 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6536 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
6537 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6538 DONE_LABEL is a label after the whole copying sequence. The label is created
6539 on demand if *DONE_LABEL is NULL.
6540 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6541 bounds after the initial copies.
6542
6543 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6544 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6545 we will dispatch to a library call for large blocks.
6546
6547 In pseudocode we do:
6548
6549 if (COUNT < SIZE)
6550 {
6551 Assume that SIZE is 4. Bigger sizes are handled analogously
6552 if (COUNT & 4)
6553 {
6554 copy 4 bytes from SRCPTR to DESTPTR
6555 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6556 goto done_label
6557 }
6558 if (!COUNT)
6559 goto done_label;
6560 copy 1 byte from SRCPTR to DESTPTR
6561 if (COUNT & 2)
6562 {
6563 copy 2 bytes from SRCPTR to DESTPTR
6564 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6565 }
6566 }
6567 else
6568 {
6569 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6570 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6571
6572 OLD_DESPTR = DESTPTR;
6573 Align DESTPTR up to DESIRED_ALIGN
6574 SRCPTR += DESTPTR - OLD_DESTPTR
6575 COUNT -= DEST_PTR - OLD_DESTPTR
6576 if (DYNAMIC_CHECK)
6577 Round COUNT down to multiple of SIZE
6578 << optional caller supplied zero size guard is here >>
6579 << optional caller supplied dynamic check is here >>
6580 << caller supplied main copy loop is here >>
6581 }
6582 done_label:
6583 */
6584 static void
6585 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6586 rtx *destptr, rtx *srcptr,
6587 machine_mode mode,
6588 rtx value, rtx vec_value,
6589 rtx *count,
6590 rtx_code_label **done_label,
6591 int size,
6592 int desired_align,
6593 int align,
6594 unsigned HOST_WIDE_INT *min_size,
6595 bool dynamic_check,
6596 bool issetmem)
6597 {
6598 rtx_code_label *loop_label = NULL, *label;
6599 int n;
6600 rtx modesize;
6601 int prolog_size = 0;
6602 rtx mode_value;
6603
6604 /* Choose the proper value to copy. */
6605 if (issetmem && VECTOR_MODE_P (mode))
6606 mode_value = vec_value;
6607 else
6608 mode_value = value;
6609 gcc_assert (GET_MODE_SIZE (mode) <= size);
6610
6611 /* See if block is big or small, handle small blocks. */
6612 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6613 {
6614 int size2 = size;
6615 loop_label = gen_label_rtx ();
6616
6617 if (!*done_label)
6618 *done_label = gen_label_rtx ();
6619
6620 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6621 1, loop_label);
6622 size2 >>= 1;
6623
6624 /* Handle sizes > 3. */
6625 for (;size2 > 2; size2 >>= 1)
6626 expand_small_cpymem_or_setmem (destmem, srcmem,
6627 *destptr, *srcptr,
6628 value, vec_value,
6629 *count,
6630 size2, *done_label, issetmem);
6631 /* Nothing to copy? Jump to DONE_LABEL if so. */
6632 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6633 1, *done_label);
6634
6635 /* Do a byte copy. */
6636 destmem = change_address (destmem, QImode, *destptr);
6637 if (issetmem)
6638 emit_move_insn (destmem, gen_lowpart (QImode, value));
6639 else
6640 {
6641 srcmem = change_address (srcmem, QImode, *srcptr);
6642 emit_move_insn (destmem, srcmem);
6643 }
6644
6645 /* Handle sizes 2 and 3. */
6646 label = ix86_expand_aligntest (*count, 2, false);
6647 destmem = change_address (destmem, HImode, *destptr);
6648 destmem = offset_address (destmem, *count, 1);
6649 destmem = offset_address (destmem, GEN_INT (-2), 2);
6650 if (issetmem)
6651 emit_move_insn (destmem, gen_lowpart (HImode, value));
6652 else
6653 {
6654 srcmem = change_address (srcmem, HImode, *srcptr);
6655 srcmem = offset_address (srcmem, *count, 1);
6656 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6657 emit_move_insn (destmem, srcmem);
6658 }
6659
6660 emit_label (label);
6661 LABEL_NUSES (label) = 1;
6662 emit_jump_insn (gen_jump (*done_label));
6663 emit_barrier ();
6664 }
6665 else
6666 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6667 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6668
6669 /* Start memcpy for COUNT >= SIZE. */
6670 if (loop_label)
6671 {
6672 emit_label (loop_label);
6673 LABEL_NUSES (loop_label) = 1;
6674 }
6675
6676 /* Copy first desired_align bytes. */
6677 if (!issetmem)
6678 srcmem = change_address (srcmem, mode, *srcptr);
6679 destmem = change_address (destmem, mode, *destptr);
6680 modesize = GEN_INT (GET_MODE_SIZE (mode));
6681 for (n = 0; prolog_size < desired_align - align; n++)
6682 {
6683 if (issetmem)
6684 emit_move_insn (destmem, mode_value);
6685 else
6686 {
6687 emit_move_insn (destmem, srcmem);
6688 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6689 }
6690 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6691 prolog_size += GET_MODE_SIZE (mode);
6692 }
6693
6694
6695 /* Copy last SIZE bytes. */
6696 destmem = offset_address (destmem, *count, 1);
6697 destmem = offset_address (destmem,
6698 GEN_INT (-size - prolog_size),
6699 1);
6700 if (issetmem)
6701 emit_move_insn (destmem, mode_value);
6702 else
6703 {
6704 srcmem = offset_address (srcmem, *count, 1);
6705 srcmem = offset_address (srcmem,
6706 GEN_INT (-size - prolog_size),
6707 1);
6708 emit_move_insn (destmem, srcmem);
6709 }
6710 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6711 {
6712 destmem = offset_address (destmem, modesize, 1);
6713 if (issetmem)
6714 emit_move_insn (destmem, mode_value);
6715 else
6716 {
6717 srcmem = offset_address (srcmem, modesize, 1);
6718 emit_move_insn (destmem, srcmem);
6719 }
6720 }
6721
6722 /* Align destination. */
6723 if (desired_align > 1 && desired_align > align)
6724 {
6725 rtx saveddest = *destptr;
6726
6727 gcc_assert (desired_align <= size);
6728 /* Align destptr up, placing the result in a new register. */
6729 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6730 GEN_INT (prolog_size),
6731 NULL_RTX, 1, OPTAB_DIRECT);
6732 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6733 REG_POINTER (*destptr) = 1;
6734 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6735 GEN_INT (-desired_align),
6736 *destptr, 1, OPTAB_DIRECT);
6737 /* See how many bytes we skipped. */
6738 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6739 *destptr,
6740 saveddest, 1, OPTAB_DIRECT);
6741 /* Adjust srcptr and count. */
6742 if (!issetmem)
6743 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6744 saveddest, *srcptr, 1, OPTAB_DIRECT);
6745 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6746 saveddest, *count, 1, OPTAB_DIRECT);
6747 /* We copied at most size + prolog_size. */
6748 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6749 *min_size
6750 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6751 else
6752 *min_size = 0;
6753
6754 /* Our loops always round down the block size, but for dispatch to a
6755 library call we need the precise value. */
6756 if (dynamic_check)
6757 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6758 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6759 }
6760 else
6761 {
6762 gcc_assert (prolog_size == 0);
6763 /* Decrease count, so we won't end up copying last word twice. */
6764 if (!CONST_INT_P (*count))
6765 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6766 constm1_rtx, *count, 1, OPTAB_DIRECT);
6767 else
6768 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6769 (unsigned HOST_WIDE_INT)size));
6770 if (*min_size)
6771 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6772 }
6773 }
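/* Numeric illustration of the alignment math above (assumed values only): if
   the old DESTPTR ends in ...3, DESIRED_ALIGN is 16 and PROLOG_SIZE is 16,
   the new DESTPTR is (old + 16) & -16 == old + 13, SAVEDDEST becomes
   old - new == -13, so SRCPTR is advanced by 13 and COUNT is reduced by 13.
   The main loop then starts at the aligned address; the few bytes it
   re-copies were already written by the prologue moves, which is harmless.  */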
6774
6775
6776 /* This function is like the previous one, except here we know how many bytes
6777 need to be copied. That allows us to update alignment not only of DST, which
6778 is returned, but also of SRC, which is passed as a pointer for that
6779 reason. */
6780 static rtx
6781 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6782 rtx srcreg, rtx value, rtx vec_value,
6783 int desired_align, int align_bytes,
6784 bool issetmem)
6785 {
6786 rtx src = NULL;
6787 rtx orig_dst = dst;
6788 rtx orig_src = NULL;
6789 int piece_size = 1;
6790 int copied_bytes = 0;
6791
6792 if (!issetmem)
6793 {
6794 gcc_assert (srcp != NULL);
6795 src = *srcp;
6796 orig_src = src;
6797 }
6798
6799 for (piece_size = 1;
6800 piece_size <= desired_align && copied_bytes < align_bytes;
6801 piece_size <<= 1)
6802 {
6803 if (align_bytes & piece_size)
6804 {
6805 if (issetmem)
6806 {
6807 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6808 dst = emit_memset (dst, destreg, vec_value, piece_size);
6809 else
6810 dst = emit_memset (dst, destreg, value, piece_size);
6811 }
6812 else
6813 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6814 copied_bytes += piece_size;
6815 }
6816 }
6817 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6818 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6819 if (MEM_SIZE_KNOWN_P (orig_dst))
6820 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6821
6822 if (!issetmem)
6823 {
6824 int src_align_bytes = get_mem_align_offset (src, desired_align
6825 * BITS_PER_UNIT);
6826 if (src_align_bytes >= 0)
6827 src_align_bytes = desired_align - src_align_bytes;
6828 if (src_align_bytes >= 0)
6829 {
6830 unsigned int src_align;
6831 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6832 {
6833 if ((src_align_bytes & (src_align - 1))
6834 == (align_bytes & (src_align - 1)))
6835 break;
6836 }
6837 if (src_align > (unsigned int) desired_align)
6838 src_align = desired_align;
6839 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6840 set_mem_align (src, src_align * BITS_PER_UNIT);
6841 }
6842 if (MEM_SIZE_KNOWN_P (orig_src))
6843 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6844 *srcp = src;
6845 }
6846
6847 return dst;
6848 }
6849
6850 /* Return true if ALG can be used in current context.
6851 Assume we expand memset if MEMSET is true. */
6852 static bool
6853 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6854 {
6855 if (alg == no_stringop)
6856 return false;
6857 if (alg == vector_loop)
6858 return TARGET_SSE || TARGET_AVX;
6859 /* Algorithms using the rep prefix want at least edi and ecx;
6860 additionally, memset wants eax and memcpy wants esi. Don't
6861 consider such algorithms if the user has appropriated those
6862 registers for their own purposes, or if we have a non-default
6863 address space, since some string insns cannot override the segment. */
6864 if (alg == rep_prefix_1_byte
6865 || alg == rep_prefix_4_byte
6866 || alg == rep_prefix_8_byte)
6867 {
6868 if (have_as)
6869 return false;
6870 if (fixed_regs[CX_REG]
6871 || fixed_regs[DI_REG]
6872 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6873 return false;
6874 }
6875 return true;
6876 }
6877
6878 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6879 static enum stringop_alg
6880 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6881 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6882 bool memset, bool zero_memset, bool have_as,
6883 int *dynamic_check, bool *noalign, bool recur)
6884 {
6885 const struct stringop_algs *algs;
6886 bool optimize_for_speed;
6887 int max = 0;
6888 const struct processor_costs *cost;
6889 int i;
6890 bool any_alg_usable_p = false;
6891
6892 *noalign = false;
6893 *dynamic_check = -1;
6894
6895 /* Even if the string operation call is cold, we still might spend a lot
6896 of time processing large blocks. */
6897 if (optimize_function_for_size_p (cfun)
6898 || (optimize_insn_for_size_p ()
6899 && (max_size < 256
6900 || (expected_size != -1 && expected_size < 256))))
6901 optimize_for_speed = false;
6902 else
6903 optimize_for_speed = true;
6904
6905 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6906 if (memset)
6907 algs = &cost->memset[TARGET_64BIT != 0];
6908 else
6909 algs = &cost->memcpy[TARGET_64BIT != 0];
6910
6911 /* See maximal size for user defined algorithm. */
6912 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6913 {
6914 enum stringop_alg candidate = algs->size[i].alg;
6915 bool usable = alg_usable_p (candidate, memset, have_as);
6916 any_alg_usable_p |= usable;
6917
6918 if (candidate != libcall && candidate && usable)
6919 max = algs->size[i].max;
6920 }
6921
6922 /* If expected size is not known but max size is small enough
6923 so inline version is a win, set expected size into
6924 the range. */
6925 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6926 && expected_size == -1)
6927 expected_size = min_size / 2 + max_size / 2;
6928
6929 /* If user specified the algorithm, honor it if possible. */
6930 if (ix86_stringop_alg != no_stringop
6931 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6932 return ix86_stringop_alg;
6933 /* rep; movq or rep; movl is the smallest variant. */
6934 else if (!optimize_for_speed)
6935 {
6936 *noalign = true;
6937 if (!count || (count & 3) || (memset && !zero_memset))
6938 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6939 ? rep_prefix_1_byte : loop_1_byte;
6940 else
6941 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6942 ? rep_prefix_4_byte : loop;
6943 }
6944 /* Very tiny blocks are best handled via the loop, REP is expensive to
6945 setup. */
6946 else if (expected_size != -1 && expected_size < 4)
6947 return loop_1_byte;
6948 else if (expected_size != -1)
6949 {
6950 enum stringop_alg alg = libcall;
6951 bool alg_noalign = false;
6952 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6953 {
6954 /* We get here if the algorithms that were not libcall-based
6955 were rep-prefix based and we are unable to use rep prefixes
6956 based on global register usage. Break out of the loop and
6957 use the heuristic below. */
6958 if (algs->size[i].max == 0)
6959 break;
6960 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6961 {
6962 enum stringop_alg candidate = algs->size[i].alg;
6963
6964 if (candidate != libcall
6965 && alg_usable_p (candidate, memset, have_as))
6966 {
6967 alg = candidate;
6968 alg_noalign = algs->size[i].noalign;
6969 }
6970 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6971 last non-libcall inline algorithm. */
6972 if (TARGET_INLINE_ALL_STRINGOPS)
6973 {
6974 /* When the current size is best to be copied by a libcall,
6975 but we are still forced to inline, run the heuristic below
6976 that will pick code for medium sized blocks. */
6977 if (alg != libcall)
6978 {
6979 *noalign = alg_noalign;
6980 return alg;
6981 }
6982 else if (!any_alg_usable_p)
6983 break;
6984 }
6985 else if (alg_usable_p (candidate, memset, have_as))
6986 {
6987 *noalign = algs->size[i].noalign;
6988 return candidate;
6989 }
6990 }
6991 }
6992 }
6993 /* When asked to inline the call anyway, try to pick meaningful choice.
6994 We look for maximal size of block that is faster to copy by hand and
6995 take blocks of at most of that size guessing that average size will
6996 be roughly half of the block.
6997
6998 If this turns out to be bad, we might simply specify the preferred
6999 choice in ix86_costs. */
7000 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7001 && (algs->unknown_size == libcall
7002 || !alg_usable_p (algs->unknown_size, memset, have_as)))
7003 {
7004 enum stringop_alg alg;
7005 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
7006
7007 /* If there aren't any usable algorithms or if recursing already,
7008 then recursing on smaller sizes or same size isn't going to
7009 find anything. Just return the simple byte-at-a-time copy loop. */
7010 if (!any_alg_usable_p || recur)
7011 {
7012 /* Pick something reasonable. */
7013 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7014 *dynamic_check = 128;
7015 return loop_1_byte;
7016 }
7017 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7018 zero_memset, have_as, dynamic_check, noalign, true);
7019 gcc_assert (*dynamic_check == -1);
7020 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7021 *dynamic_check = max;
7022 else
7023 gcc_assert (alg != libcall);
7024 return alg;
7025 }
7026 return (alg_usable_p (algs->unknown_size, memset, have_as)
7027 ? algs->unknown_size : libcall);
7028 }
7029
7030 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7031 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7032 static int
7033 decide_alignment (int align,
7034 enum stringop_alg alg,
7035 int expected_size,
7036 machine_mode move_mode)
7037 {
7038 int desired_align = 0;
7039
7040 gcc_assert (alg != no_stringop);
7041
7042 if (alg == libcall)
7043 return 0;
7044 if (move_mode == VOIDmode)
7045 return 0;
7046
7047 desired_align = GET_MODE_SIZE (move_mode);
7048 /* PentiumPro has special logic triggering for 8 byte aligned blocks.
7049 copying whole cacheline at once. */
7050 if (TARGET_PENTIUMPRO
7051 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7052 desired_align = 8;
7053
7054 if (optimize_size)
7055 desired_align = 1;
7056 if (desired_align < align)
7057 desired_align = align;
7058 if (expected_size != -1 && expected_size < 4)
7059 desired_align = align;
7060
7061 return desired_align;
7062 }
7063
7064
7065 /* Helper function for memset. For a QImode value 0xXY produce
7066 0xXYXYXYXY of the width specified by MODE. This is essentially
7067 a multiplication by 0x01010101, but we can do slightly better than
7068 synth_mult by unwinding the sequence by hand on CPUs with
7069 slow multiply. */
7070 static rtx
7071 promote_duplicated_reg (machine_mode mode, rtx val)
7072 {
7073 machine_mode valmode = GET_MODE (val);
7074 rtx tmp;
7075 int nops = mode == DImode ? 3 : 2;
7076
7077 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7078 if (val == const0_rtx)
7079 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7080 if (CONST_INT_P (val))
7081 {
7082 HOST_WIDE_INT v = INTVAL (val) & 255;
7083
7084 v |= v << 8;
7085 v |= v << 16;
7086 if (mode == DImode)
7087 v |= (v << 16) << 16;
7088 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7089 }
7090
7091 if (valmode == VOIDmode)
7092 valmode = QImode;
7093 if (valmode != QImode)
7094 val = gen_lowpart (QImode, val);
7095 if (mode == QImode)
7096 return val;
7097 if (!TARGET_PARTIAL_REG_STALL)
7098 nops--;
7099 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7100 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7101 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7102 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7103 {
7104 rtx reg = convert_modes (mode, QImode, val, true);
7105 tmp = promote_duplicated_reg (mode, const1_rtx);
7106 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7107 OPTAB_DIRECT);
7108 }
7109 else
7110 {
7111 rtx reg = convert_modes (mode, QImode, val, true);
7112
7113 if (!TARGET_PARTIAL_REG_STALL)
7114 if (mode == SImode)
7115 emit_insn (gen_insvsi_1 (reg, reg));
7116 else
7117 emit_insn (gen_insvdi_1 (reg, reg));
7118 else
7119 {
7120 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7121 NULL, 1, OPTAB_DIRECT);
7122 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7123 OPTAB_DIRECT);
7124 }
7125 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7126 NULL, 1, OPTAB_DIRECT);
7127 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7128 if (mode == SImode)
7129 return reg;
7130 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7131 NULL, 1, OPTAB_DIRECT);
7132 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7133 return reg;
7134 }
7135 }
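/* For illustration (a sketch under the assumption that the shift/IOR path is
   taken), promoting the QImode value 0xAB to DImode computes the equivalent of

     x = 0xAB;
     x |= x << 8;     now 0xABAB
     x |= x << 16;    now 0xABABABAB
     x |= x << 32;    now 0xABABABABABABABAB

   which is the same result as multiplying 0xAB by 0x0101010101010101.  */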
7136
7137 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
7138 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
7139 alignment from ALIGN to DESIRED_ALIGN. */
7140 static rtx
7141 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7142 int align)
7143 {
7144 rtx promoted_val;
7145
7146 if (TARGET_64BIT
7147 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7148 promoted_val = promote_duplicated_reg (DImode, val);
7149 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7150 promoted_val = promote_duplicated_reg (SImode, val);
7151 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7152 promoted_val = promote_duplicated_reg (HImode, val);
7153 else
7154 promoted_val = val;
7155
7156 return promoted_val;
7157 }
7158
7159 /* Copy the address to a Pmode register. This is used for x32 to
7160 truncate DImode TLS address to a SImode register. */
7161
7162 static rtx
7163 ix86_copy_addr_to_reg (rtx addr)
7164 {
7165 rtx reg;
7166 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7167 {
7168 reg = copy_addr_to_reg (addr);
7169 REG_POINTER (reg) = 1;
7170 return reg;
7171 }
7172 else
7173 {
7174 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7175 reg = copy_to_mode_reg (DImode, addr);
7176 REG_POINTER (reg) = 1;
7177 return gen_rtx_SUBREG (SImode, reg, 0);
7178 }
7179 }
7180
7181 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
7182 operations when profitable. The code depends upon architecture, block size
7183 and alignment, but always has one of the following overall structures:
7184
7185 Aligned move sequence:
7186
7187 1) Prologue guard: Conditional that jumps up to epilogues for small
7188 blocks that can be handled by epilogue alone. This is faster
7189 but also needed for correctness, since the prologue assumes the block
7190 is larger than the desired alignment.
7191
7192 Optional dynamic check for size and libcall for large
7193 blocks is emitted here too, with -minline-stringops-dynamically.
7194
7195 2) Prologue: copy first few bytes in order to get destination
7196 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7197 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7198 copied. We emit either a jump tree on power of two sized
7199 blocks, or a byte loop.
7200
7201 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7202 with specified algorithm.
7203
7204 4) Epilogue: code copying tail of the block that is too small to be
7205 handled by main body (or up to size guarded by prologue guard).
7206
7207 Misaligned move sequence
7208
7209 1) misaligned move prologue/epilogue containing:
7210 a) Prologue handling small memory blocks and jumping to done_label
7211 (skipped if blocks are known to be large enough)
7212 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
7213 needed by single possibly misaligned move
7214 (skipped if alignment is not needed)
7215 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7216
7217 2) Zero size guard dispatching to done_label, if needed
7218
7219 3) Dispatch to a library call, if needed.
7220
7221 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7222 with specified algorithm. */
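/* As a concrete (hypothetical) illustration of the aligned sequence: for a
   memset of unknown size expanded with the rep_prefix_4_byte strategy, the
   code below roughly produces

     if (count < epilogue_size_needed) goto epilogue;
     ... store a few bytes until dst is 4-byte aligned ...
     rep stosl                        main body, count / 4 stores
     epilogue:
     ... store the remaining count & 3 bytes via a jump tree ...

   with the exact guards, promotion of VAL and branch predictions decided by
   the expansion code that follows.  */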
7223 bool
7224 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7225 rtx align_exp, rtx expected_align_exp,
7226 rtx expected_size_exp, rtx min_size_exp,
7227 rtx max_size_exp, rtx probable_max_size_exp,
7228 bool issetmem)
7229 {
7230 rtx destreg;
7231 rtx srcreg = NULL;
7232 rtx_code_label *label = NULL;
7233 rtx tmp;
7234 rtx_code_label *jump_around_label = NULL;
7235 HOST_WIDE_INT align = 1;
7236 unsigned HOST_WIDE_INT count = 0;
7237 HOST_WIDE_INT expected_size = -1;
7238 int size_needed = 0, epilogue_size_needed;
7239 int desired_align = 0, align_bytes = 0;
7240 enum stringop_alg alg;
7241 rtx promoted_val = NULL;
7242 rtx vec_promoted_val = NULL;
7243 bool force_loopy_epilogue = false;
7244 int dynamic_check;
7245 bool need_zero_guard = false;
7246 bool noalign;
7247 machine_mode move_mode = VOIDmode;
7248 machine_mode wider_mode;
7249 int unroll_factor = 1;
7250 /* TODO: Once value ranges are available, fill in proper data. */
7251 unsigned HOST_WIDE_INT min_size = 0;
7252 unsigned HOST_WIDE_INT max_size = -1;
7253 unsigned HOST_WIDE_INT probable_max_size = -1;
7254 bool misaligned_prologue_used = false;
7255 bool have_as;
7256
7257 if (CONST_INT_P (align_exp))
7258 align = INTVAL (align_exp);
7259 /* i386 can do misaligned access at a reasonably increased cost. */
7260 if (CONST_INT_P (expected_align_exp)
7261 && INTVAL (expected_align_exp) > align)
7262 align = INTVAL (expected_align_exp);
7263 /* ALIGN is the minimum of destination and source alignment, but we care here
7264 just about destination alignment. */
7265 else if (!issetmem
7266 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7267 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7268
7269 if (CONST_INT_P (count_exp))
7270 {
7271 min_size = max_size = probable_max_size = count = expected_size
7272 = INTVAL (count_exp);
7273 /* When COUNT is 0, there is nothing to do. */
7274 if (!count)
7275 return true;
7276 }
7277 else
7278 {
7279 if (min_size_exp)
7280 min_size = INTVAL (min_size_exp);
7281 if (max_size_exp)
7282 max_size = INTVAL (max_size_exp);
7283 if (probable_max_size_exp)
7284 probable_max_size = INTVAL (probable_max_size_exp);
7285 if (CONST_INT_P (expected_size_exp))
7286 expected_size = INTVAL (expected_size_exp);
7287 }
7288
7289 /* Make sure we don't need to care about overflow later on. */
7290 if (count > (HOST_WIDE_INT_1U << 30))
7291 return false;
7292
7293 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7294 if (!issetmem)
7295 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7296
7297 /* Step 0: Decide on preferred algorithm, desired alignment and
7298 size of chunks to be copied by main loop. */
7299 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7300 issetmem,
7301 issetmem && val_exp == const0_rtx, have_as,
7302 &dynamic_check, &noalign, false);
7303
7304 if (dump_file)
7305 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7306 stringop_alg_names[alg]);
7307
7308 if (alg == libcall)
7309 return false;
7310 gcc_assert (alg != no_stringop);
7311
7312 /* For now the vector version of memset is generated only for memory zeroing, as
7313 creating the promoted vector value is very cheap in this case. */
7314 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7315 alg = unrolled_loop;
7316
7317 if (!count)
7318 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7319 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7320 if (!issetmem)
7321 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7322
7323 unroll_factor = 1;
7324 move_mode = word_mode;
7325 switch (alg)
7326 {
7327 case libcall:
7328 case no_stringop:
7329 case last_alg:
7330 gcc_unreachable ();
7331 case loop_1_byte:
7332 need_zero_guard = true;
7333 move_mode = QImode;
7334 break;
7335 case loop:
7336 need_zero_guard = true;
7337 break;
7338 case unrolled_loop:
7339 need_zero_guard = true;
7340 unroll_factor = (TARGET_64BIT ? 4 : 2);
7341 break;
7342 case vector_loop:
7343 need_zero_guard = true;
7344 unroll_factor = 4;
7345 /* Find the widest supported mode. */
7346 move_mode = word_mode;
7347 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7348 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7349 move_mode = wider_mode;
7350
7351 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
7352 move_mode = TImode;
7353
7354 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7355 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7356 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7357 {
7358 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7359 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7360 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7361 move_mode = word_mode;
7362 }
7363 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7364 break;
7365 case rep_prefix_8_byte:
7366 move_mode = DImode;
7367 break;
7368 case rep_prefix_4_byte:
7369 move_mode = SImode;
7370 break;
7371 case rep_prefix_1_byte:
7372 move_mode = QImode;
7373 break;
7374 }
7375 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7376 epilogue_size_needed = size_needed;
7377
7378 /* If we are going to make any library calls conditionally, make sure any
7379 pending stack adjustment happens before the first conditional branch,
7380 otherwise they will be emitted before the library call only and won't
7381 happen from the other branches. */
7382 if (dynamic_check != -1)
7383 do_pending_stack_adjust ();
7384
7385 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7386 if (!TARGET_ALIGN_STRINGOPS || noalign)
7387 align = desired_align;
7388
7389 /* Step 1: Prologue guard. */
7390
7391 /* Alignment code needs count to be in register. */
7392 if (CONST_INT_P (count_exp) && desired_align > align)
7393 {
7394 if (INTVAL (count_exp) > desired_align
7395 && INTVAL (count_exp) > size_needed)
7396 {
7397 align_bytes
7398 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7399 if (align_bytes <= 0)
7400 align_bytes = 0;
7401 else
7402 align_bytes = desired_align - align_bytes;
7403 }
7404 if (align_bytes == 0)
7405 count_exp = force_reg (counter_mode (count_exp), count_exp);
7406 }
7407 gcc_assert (desired_align >= 1 && align >= 1);
7408
7409 /* Misaligned move sequences handle both prologue and epilogue at once.
7410 Default code generation results in smaller code for large alignments
7411 and also avoids redundant work when sizes are known precisely.
7412 misaligned_prologue_used
7413 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7414 && MAX (desired_align, epilogue_size_needed) <= 32
7415 && desired_align <= epilogue_size_needed
7416 && ((desired_align > align && !align_bytes)
7417 || (!count && epilogue_size_needed > 1)));
7418
7419 /* Do the cheap promotion to allow better CSE across the
7420 main loop and epilogue (i.e. one load of the big constant in
7421 front of all the code).
7422 For now the misaligned move sequences do not have a fast path
7423 without broadcasting. */
7424 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7425 {
7426 if (alg == vector_loop)
7427 {
7428 gcc_assert (val_exp == const0_rtx);
7429 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7430 promoted_val = promote_duplicated_reg_to_size (val_exp,
7431 GET_MODE_SIZE (word_mode),
7432 desired_align, align);
7433 }
7434 else
7435 {
7436 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7437 desired_align, align);
7438 }
7439 }
7440 /* Misaligned move sequences handle both prologues and epilogues at once.
7441 Default code generation results in smaller code for large alignments and
7442 also avoids redundant work when sizes are known precisely.
7443 if (misaligned_prologue_used)
7444 {
7445 /* The misaligned move prologue handles small blocks by itself. */
7446 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7447 (dst, src, &destreg, &srcreg,
7448 move_mode, promoted_val, vec_promoted_val,
7449 &count_exp,
7450 &jump_around_label,
7451 desired_align < align
7452 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7453 desired_align, align, &min_size, dynamic_check, issetmem);
7454 if (!issetmem)
7455 src = change_address (src, BLKmode, srcreg);
7456 dst = change_address (dst, BLKmode, destreg);
7457 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7458 epilogue_size_needed = 0;
7459 if (need_zero_guard
7460 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7461 {
7462 /* It is possible that we copied enough so the main loop will not
7463 execute. */
7464 gcc_assert (size_needed > 1);
7465 if (jump_around_label == NULL_RTX)
7466 jump_around_label = gen_label_rtx ();
7467 emit_cmp_and_jump_insns (count_exp,
7468 GEN_INT (size_needed),
7469 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7470 if (expected_size == -1
7471 || expected_size < (desired_align - align) / 2 + size_needed)
7472 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7473 else
7474 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7475 }
7476 }
7477 /* Ensure that alignment prologue won't copy past end of block. */
7478 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7479 {
7480 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7481 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
7482 Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
7483 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7484
7485 /* To improve performance of small blocks, we jump around the VAL
7486 promoting code. This means that if the promoted VAL is not constant,
7487 we might not use it in the epilogue and have to use the byte
7488 loop variant. */
7489 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7490 force_loopy_epilogue = true;
7491 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7492 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7493 {
7494 /* If main algorithm works on QImode, no epilogue is needed.
7495 For small sizes just don't align anything. */
7496 if (size_needed == 1)
7497 desired_align = align;
7498 else
7499 goto epilogue;
7500 }
7501 else if (!count
7502 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7503 {
7504 label = gen_label_rtx ();
7505 emit_cmp_and_jump_insns (count_exp,
7506 GEN_INT (epilogue_size_needed),
7507 LTU, 0, counter_mode (count_exp), 1, label);
7508 if (expected_size == -1 || expected_size < epilogue_size_needed)
7509 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7510 else
7511 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7512 }
7513 }
7514
7515 /* Emit code to decide on runtime whether library call or inline should be
7516 used. */
7517 if (dynamic_check != -1)
7518 {
7519 if (!issetmem && CONST_INT_P (count_exp))
7520 {
7521 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7522 {
7523 emit_block_copy_via_libcall (dst, src, count_exp);
7524 count_exp = const0_rtx;
7525 goto epilogue;
7526 }
7527 }
7528 else
7529 {
7530 rtx_code_label *hot_label = gen_label_rtx ();
7531 if (jump_around_label == NULL_RTX)
7532 jump_around_label = gen_label_rtx ();
7533 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7534 LEU, 0, counter_mode (count_exp),
7535 1, hot_label);
7536 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7537 if (issetmem)
7538 set_storage_via_libcall (dst, count_exp, val_exp);
7539 else
7540 emit_block_copy_via_libcall (dst, src, count_exp);
7541 emit_jump (jump_around_label);
7542 emit_label (hot_label);
7543 }
7544 }
7545
7546 /* Step 2: Alignment prologue. */
7547 /* Do the expensive promotion once we branched off the small blocks. */
7548 if (issetmem && !promoted_val)
7549 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7550 desired_align, align);
7551
7552 if (desired_align > align && !misaligned_prologue_used)
7553 {
7554 if (align_bytes == 0)
7555 {
7556 /* Except for the first move in the prologue, we no longer know
7557 the constant offset in the aliasing info. It doesn't seem worth
7558 the pain to maintain it for the first move, so throw away
7559 the info early. */
7560 dst = change_address (dst, BLKmode, destreg);
7561 if (!issetmem)
7562 src = change_address (src, BLKmode, srcreg);
7563 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7564 promoted_val, vec_promoted_val,
7565 count_exp, align, desired_align,
7566 issetmem);
7567 /* At most desired_align - align bytes are copied. */
7568 if (min_size < (unsigned)(desired_align - align))
7569 min_size = 0;
7570 else
7571 min_size -= desired_align - align;
7572 }
7573 else
7574 {
7575 /* If we know how many bytes need to be stored before dst is
7576 sufficiently aligned, maintain aliasing info accurately. */
7577 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7578 srcreg,
7579 promoted_val,
7580 vec_promoted_val,
7581 desired_align,
7582 align_bytes,
7583 issetmem);
7584
7585 count_exp = plus_constant (counter_mode (count_exp),
7586 count_exp, -align_bytes);
7587 count -= align_bytes;
7588 min_size -= align_bytes;
7589 max_size -= align_bytes;
7590 }
7591 if (need_zero_guard
7592 && min_size < (unsigned HOST_WIDE_INT) size_needed
7593 && (count < (unsigned HOST_WIDE_INT) size_needed
7594 || (align_bytes == 0
7595 && count < ((unsigned HOST_WIDE_INT) size_needed
7596 + desired_align - align))))
7597 {
7598 /* It is possible that we copied enough so the main loop will not
7599 execute. */
7600 gcc_assert (size_needed > 1);
7601 if (label == NULL_RTX)
7602 label = gen_label_rtx ();
7603 emit_cmp_and_jump_insns (count_exp,
7604 GEN_INT (size_needed),
7605 LTU, 0, counter_mode (count_exp), 1, label);
7606 if (expected_size == -1
7607 || expected_size < (desired_align - align) / 2 + size_needed)
7608 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7609 else
7610 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7611 }
7612 }
7613 if (label && size_needed == 1)
7614 {
7615 emit_label (label);
7616 LABEL_NUSES (label) = 1;
7617 label = NULL;
7618 epilogue_size_needed = 1;
7619 if (issetmem)
7620 promoted_val = val_exp;
7621 }
7622 else if (label == NULL_RTX && !misaligned_prologue_used)
7623 epilogue_size_needed = size_needed;
7624
7625 /* Step 3: Main loop. */
7626
7627 switch (alg)
7628 {
7629 case libcall:
7630 case no_stringop:
7631 case last_alg:
7632 gcc_unreachable ();
7633 case loop_1_byte:
7634 case loop:
7635 case unrolled_loop:
7636 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7637 count_exp, move_mode, unroll_factor,
7638 expected_size, issetmem);
7639 break;
7640 case vector_loop:
7641 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7642 vec_promoted_val, count_exp, move_mode,
7643 unroll_factor, expected_size, issetmem);
7644 break;
7645 case rep_prefix_8_byte:
7646 case rep_prefix_4_byte:
7647 case rep_prefix_1_byte:
7648 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7649 val_exp, count_exp, move_mode, issetmem);
7650 break;
7651 }
7652 /* Properly adjust the offset of src and dest memory for aliasing. */
7653 if (CONST_INT_P (count_exp))
7654 {
7655 if (!issetmem)
7656 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7657 (count / size_needed) * size_needed);
7658 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7659 (count / size_needed) * size_needed);
7660 }
7661 else
7662 {
7663 if (!issetmem)
7664 src = change_address (src, BLKmode, srcreg);
7665 dst = change_address (dst, BLKmode, destreg);
7666 }
7667
7668 /* Step 4: Epilogue to copy the remaining bytes. */
7669 epilogue:
7670 if (label)
7671 {
7672      /* When the main loop is done, COUNT_EXP might hold the original count,
7673 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
7674 	 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
7675 	 bytes.  Compensate if needed.  */
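      /* Illustrative example (a sketch, not from the original sources):
	 with COUNT = 1000 and SIZE_NEEDED = 16, the main loop handles
	 1000 & ~15 = 992 bytes, so the epilogue must copy 1000 & 15 = 8
	 bytes; masking COUNT_EXP with SIZE_NEEDED - 1 below produces
	 exactly that residue.  */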
7676
7677 if (size_needed < epilogue_size_needed)
7678 {
7679 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7680 GEN_INT (size_needed - 1), count_exp, 1,
7681 OPTAB_DIRECT);
7682 if (tmp != count_exp)
7683 emit_move_insn (count_exp, tmp);
7684 }
7685 emit_label (label);
7686 LABEL_NUSES (label) = 1;
7687 }
7688
7689 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7690 {
7691 if (force_loopy_epilogue)
7692 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7693 epilogue_size_needed);
7694 else
7695 {
7696 if (issetmem)
7697 expand_setmem_epilogue (dst, destreg, promoted_val,
7698 vec_promoted_val, count_exp,
7699 epilogue_size_needed);
7700 else
7701 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7702 epilogue_size_needed);
7703 }
7704 }
7705 if (jump_around_label)
7706 emit_label (jump_around_label);
7707 return true;
7708 }
7709
7710
7711 /* Expand the appropriate insns for doing strlen if not just doing
7712 repnz; scasb
7713
7714 out = result, initialized with the start address
7715 align_rtx = alignment of the address.
7716    scratch = scratch register, initialized with the start address when
7717 	      not aligned, otherwise undefined
7718
7719 This is just the body. It needs the initializations mentioned above and
7720 some address computing at the end. These things are done in i386.md. */
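/* Roughly, the code emitted by this function behaves like the following C
   sketch (an illustration only; the name is made up and the real expansion
   works on RTL, using the zero-byte trick documented inside the loop):

     char *strlen_body_sketch (char *p)
     {
       while (((unsigned long) p & 3) != 0)
	 {
	   if (*p == 0)
	     return p;
	   p++;
	 }
       for (;;)
	 {
	   unsigned int w = *(unsigned int *) p;
	   p += 4;
	   if ((w - 0x01010101U) & ~w & 0x80808080U)
	     break;
	 }
       p -= 4;
       while (*p != 0)
	 p++;
       return p;
     }

   i.e. it returns the address of the terminating zero byte, not the length
   itself; the caller subtracts the start address afterwards.  */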
7721
7722 static void
7723 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7724 {
7725 int align;
7726 rtx tmp;
7727 rtx_code_label *align_2_label = NULL;
7728 rtx_code_label *align_3_label = NULL;
7729 rtx_code_label *align_4_label = gen_label_rtx ();
7730 rtx_code_label *end_0_label = gen_label_rtx ();
7731 rtx mem;
7732 rtx tmpreg = gen_reg_rtx (SImode);
7733 rtx scratch = gen_reg_rtx (SImode);
7734 rtx cmp;
7735
7736 align = 0;
7737 if (CONST_INT_P (align_rtx))
7738 align = INTVAL (align_rtx);
7739
7740 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7741
7742 /* Is there a known alignment and is it less than 4? */
7743 if (align < 4)
7744 {
7745 rtx scratch1 = gen_reg_rtx (Pmode);
7746 emit_move_insn (scratch1, out);
7747 /* Is there a known alignment and is it not 2? */
7748 if (align != 2)
7749 {
7750 	  align_3_label = gen_label_rtx (); /* Label when address is 3 mod 4.  */
7751 	  align_2_label = gen_label_rtx (); /* Label when address is 2 mod 4.  */
7752
7753 /* Leave just the 3 lower bits. */
7754 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7755 NULL_RTX, 0, OPTAB_WIDEN);
7756
7757 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7758 Pmode, 1, align_4_label);
7759 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7760 Pmode, 1, align_2_label);
7761 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7762 Pmode, 1, align_3_label);
7763 }
7764 else
7765 {
7766 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
7767 	     check whether it is aligned to 4 bytes.  */
7768
7769 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7770 NULL_RTX, 0, OPTAB_WIDEN);
7771
7772 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7773 Pmode, 1, align_4_label);
7774 }
7775
7776 mem = change_address (src, QImode, out);
7777
7778 /* Now compare the bytes. */
7779
7780   /* Compare the first n unaligned bytes one byte at a time.  */
7781 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7782 QImode, 1, end_0_label);
7783
7784 /* Increment the address. */
7785 emit_insn (gen_add2_insn (out, const1_rtx));
7786
7787   /* Not needed with an alignment of 2.  */
7788 if (align != 2)
7789 {
7790 emit_label (align_2_label);
7791
7792 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7793 end_0_label);
7794
7795 emit_insn (gen_add2_insn (out, const1_rtx));
7796
7797 emit_label (align_3_label);
7798 }
7799
7800 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7801 end_0_label);
7802
7803 emit_insn (gen_add2_insn (out, const1_rtx));
7804 }
7805
7806   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
7807      align this loop: it only makes the program larger and does not help
7808      speed it up.  */
7809 emit_label (align_4_label);
7810
7811 mem = change_address (src, SImode, out);
7812 emit_move_insn (scratch, mem);
7813 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7814
7815   /* This formula yields a nonzero result iff one of the bytes is zero.
7816      This saves three branches inside the loop and many cycles.  */
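  /* As an illustration (a sketch, not part of the original code): for a
     32-bit word W, the value

	 (W - 0x01010101) & ~W & 0x80808080

     is nonzero exactly when some byte of W is zero.  Subtracting 1 turns
     a 0x00 byte into 0xff, setting its top bit, and the ~W mask rejects
     bytes whose top bit was already set.  E.g. W = 0x41420043 yields
     0x00008000, while W = 0x41424344 yields 0.  */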
7817
7818 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7819 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7820 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7821 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7822 gen_int_mode (0x80808080, SImode)));
7823 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7824 align_4_label);
7825
7826 if (TARGET_CMOVE)
7827 {
7828 rtx reg = gen_reg_rtx (SImode);
7829 rtx reg2 = gen_reg_rtx (Pmode);
7830 emit_move_insn (reg, tmpreg);
7831 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7832
7833 /* If zero is not in the first two bytes, move two bytes forward. */
7834 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7835 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7836 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7837 emit_insn (gen_rtx_SET (tmpreg,
7838 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7839 reg,
7840 tmpreg)));
7841       /* Emit the lea manually to avoid clobbering the flags.  */
7842 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
7843
7844 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7845 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7846 emit_insn (gen_rtx_SET (out,
7847 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7848 reg2,
7849 out)));
7850 }
7851 else
7852 {
7853 rtx_code_label *end_2_label = gen_label_rtx ();
7854 /* Is zero in the first two bytes? */
7855
7856 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7857 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7858 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7859 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7860 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7861 pc_rtx);
7862 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7863 JUMP_LABEL (tmp) = end_2_label;
7864
7865 /* Not in the first two. Move two bytes forward. */
7866 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7867 emit_insn (gen_add2_insn (out, const2_rtx));
7868
7869 emit_label (end_2_label);
7870
7871 }
7872
7873    /* Avoid a branch when adjusting OUT for the position of the zero byte.  */
7874 tmpreg = gen_lowpart (QImode, tmpreg);
7875 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7876 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7877 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7878 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7879
7880 emit_label (end_0_label);
7881 }
7882
7883 /* Expand strlen. */
7884
7885 bool
7886 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7887 {
7888 if (TARGET_UNROLL_STRLEN
7889 && TARGET_INLINE_ALL_STRINGOPS
7890 && eoschar == const0_rtx
7891 && optimize > 1)
7892 {
7893       /* The generic case of the strlen expander is long.  Avoid expanding
7894 	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
7895       rtx addr = force_reg (Pmode, XEXP (src, 0));
7896       /* It seems that some optimizer does not combine a call like
7897 	 foo(strlen(bar), strlen(bar));
7898 	 when the move and the subtraction are done here.  It does calculate
7899 	 the length just once when these instructions are emitted inside
7900 	 output_strlen_unroll().  But since &bar[strlen(bar)] is often used
7901 	 and this way one fewer register is live for the lifetime of
7902 	 output_strlen_unroll(), this is better.  */
7903
7904 emit_move_insn (out, addr);
7905
7906 ix86_expand_strlensi_unroll_1 (out, src, align);
7907
7908 /* strlensi_unroll_1 returns the address of the zero at the end of
7909 the string, like memchr(), so compute the length by subtracting
7910 the start address. */
7911 emit_insn (gen_sub2_insn (out, addr));
7912 return true;
7913 }
7914 else
7915 return false;
7916 }
7917
7918 /* For a given symbol (function), construct code to compute the address of
7919    its PLT entry in the large x86-64 PIC model.  */
7920
7921 static rtx
7922 construct_plt_address (rtx symbol)
7923 {
7924 rtx tmp, unspec;
7925
7926 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7927 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7928 gcc_assert (Pmode == DImode);
7929
7930 tmp = gen_reg_rtx (Pmode);
7931 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7932
7933 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7934 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7935 return tmp;
7936 }
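/* A rough sketch of the sequence emitted above (illustrative only; the
   actual register is chosen by the register allocator and the PIC base
   register depends on the target configuration):

       movabsq	$symbol@PLTOFF, %tmp	# 64-bit offset of the PLT entry
       addq	%<pic-reg>, %tmp	# add the GOT base address

   leaving the absolute address of SYMBOL's PLT entry in %tmp.  */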
7937
7938 /* Additional registers that are clobbered by SYSV calls. */
7939
7940 static int const x86_64_ms_sysv_extra_clobbered_registers
7941 [NUM_X86_64_MS_CLOBBERED_REGS] =
7942 {
7943 SI_REG, DI_REG,
7944 XMM6_REG, XMM7_REG,
7945 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7946 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7947 };
7948
7949 rtx_insn *
7950 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7951 rtx callarg2,
7952 rtx pop, bool sibcall)
7953 {
7954 rtx vec[3];
7955 rtx use = NULL, call;
7956 unsigned int vec_len = 0;
7957 tree fndecl;
7958
7959 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7960 {
7961 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7962 if (fndecl
7963 && (lookup_attribute ("interrupt",
7964 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7965 error ("interrupt service routine cannot be called directly");
7966 }
7967 else
7968 fndecl = NULL_TREE;
7969
7970 if (pop == const0_rtx)
7971 pop = NULL;
7972 gcc_assert (!TARGET_64BIT || !pop);
7973
7974 if (TARGET_MACHO && !TARGET_64BIT)
7975 {
7976 #if TARGET_MACHO
7977 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7978 fnaddr = machopic_indirect_call_target (fnaddr);
7979 #endif
7980 }
7981 else
7982 {
7983       /* Static functions and indirect calls don't need the PIC register.  Also,
7984 	 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
7985 	 attribute, which makes this an indirect call.  */
7986 rtx addr = XEXP (fnaddr, 0);
7987 if (flag_pic
7988 && GET_CODE (addr) == SYMBOL_REF
7989 && !SYMBOL_REF_LOCAL_P (addr))
7990 {
7991 if (flag_plt
7992 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7993 || !lookup_attribute ("noplt",
7994 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7995 {
7996 if (!TARGET_64BIT
7997 || (ix86_cmodel == CM_LARGE_PIC
7998 && DEFAULT_ABI != MS_ABI))
7999 {
8000 use_reg (&use, gen_rtx_REG (Pmode,
8001 REAL_PIC_OFFSET_TABLE_REGNUM));
8002 if (ix86_use_pseudo_pic_reg ())
8003 emit_move_insn (gen_rtx_REG (Pmode,
8004 REAL_PIC_OFFSET_TABLE_REGNUM),
8005 pic_offset_table_rtx);
8006 }
8007 }
8008 else if (!TARGET_PECOFF && !TARGET_MACHO)
8009 {
8010 if (TARGET_64BIT)
8011 {
8012 fnaddr = gen_rtx_UNSPEC (Pmode,
8013 gen_rtvec (1, addr),
8014 UNSPEC_GOTPCREL);
8015 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8016 }
8017 else
8018 {
8019 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8020 UNSPEC_GOT);
8021 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8022 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8023 fnaddr);
8024 }
8025 fnaddr = gen_const_mem (Pmode, fnaddr);
8026 /* Pmode may not be the same as word_mode for x32, which
8027 doesn't support indirect branch via 32-bit memory slot.
8028 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8029 indirect branch via x32 GOT slot is OK. */
8030 if (GET_MODE (fnaddr) != word_mode)
8031 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8032 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8033 }
8034 }
8035 }
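  /* For illustration (a sketch, not from the original sources): with
     -fno-plt or the "noplt" attribute, an external call foo () on 64-bit
     targets is emitted through its GOT slot directly, roughly

	 call	*foo@GOTPCREL(%rip)

     instead of going through the PLT; 32-bit code likewise calls through
     the GOT entry addressed off the PIC register.  */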
8036
8037 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8038 parameters passed in vector registers. */
8039 if (TARGET_64BIT
8040 && (INTVAL (callarg2) > 0
8041 || (INTVAL (callarg2) == 0
8042 && (TARGET_SSE || !flag_skip_rax_setup))))
8043 {
8044 rtx al = gen_rtx_REG (QImode, AX_REG);
8045 emit_move_insn (al, callarg2);
8046 use_reg (&use, al);
8047 }
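  /* Illustrative note (not from the original sources): in the x86-64 SysV
     ABI, a call to a variadic (or unprototyped) function passes in %al an
     upper bound on the number of vector registers used for arguments, so
     e.g. printf ("%f", d) is normally preceded by something like

	 mov	$1, %al
	 call	printf

     CALLARG2 carries that count here, and the move above preserves the
     contract unless -mskip-rax-setup allows omitting it.  */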
8048
8049 if (ix86_cmodel == CM_LARGE_PIC
8050 && !TARGET_PECOFF
8051 && MEM_P (fnaddr)
8052 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8053 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8054 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8055 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8056 branch via x32 GOT slot is OK. */
8057 else if (!(TARGET_X32
8058 && MEM_P (fnaddr)
8059 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8060 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8061 && (sibcall
8062 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8063 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8064 {
8065 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8066 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8067 }
8068
8069 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8070
8071 if (retval)
8072 call = gen_rtx_SET (retval, call);
8073 vec[vec_len++] = call;
8074
8075 if (pop)
8076 {
8077 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8078 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8079 vec[vec_len++] = pop;
8080 }
8081
8082 if (cfun->machine->no_caller_saved_registers
8083 && (!fndecl
8084 || (!TREE_THIS_VOLATILE (fndecl)
8085 && !lookup_attribute ("no_caller_saved_registers",
8086 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8087 {
8088 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8089 bool is_64bit_ms_abi = (TARGET_64BIT
8090 && ix86_function_abi (fndecl) == MS_ABI);
8091 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8092
8093 /* If there are no caller-saved registers, add all registers
8094 that are clobbered by the call which returns. */
8095 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8096 if (!fixed_regs[i]
8097 && (ix86_call_used_regs[i] == 1
8098 || (ix86_call_used_regs[i] & c_mask))
8099 && !STACK_REGNO_P (i)
8100 && !MMX_REGNO_P (i))
8101 clobber_reg (&use,
8102 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8103 }
8104 else if (TARGET_64BIT_MS_ABI
8105 && (!callarg2 || INTVAL (callarg2) != -2))
8106 {
8107 unsigned i;
8108
8109 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8110 {
8111 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8112 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8113
8114 clobber_reg (&use, gen_rtx_REG (mode, regno));
8115 }
8116
8117 /* Set here, but it may get cleared later. */
8118 if (TARGET_CALL_MS2SYSV_XLOGUES)
8119 {
8120 if (!TARGET_SSE)
8121 ;
8122
8123 /* Don't break hot-patched functions. */
8124 else if (ix86_function_ms_hook_prologue (current_function_decl))
8125 ;
8126
8127 /* TODO: Cases not yet examined. */
8128 else if (flag_split_stack)
8129 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8130
8131 else
8132 {
8133 gcc_assert (!reload_completed);
8134 cfun->machine->call_ms2sysv = true;
8135 }
8136 }
8137 }
8138
8139 if (vec_len > 1)
8140 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8141 rtx_insn *call_insn = emit_call_insn (call);
8142 if (use)
8143 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8144
8145 return call_insn;
8146 }
8147
8148 /* Split a simple return that pops POPC bytes from the stack into an
8149    indirect branch with a stack adjustment.  */
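/* A rough sketch of what this expands to (illustrative only):

       popl	%ecx		# return address -> %ecx
       addl	$POPC, %esp	# drop the callee-popped argument bytes
       jmp	*%ecx		# return

   i.e. the effect of "ret $POPC", split so that the return address goes
   through a register.  */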
8150
8151 void
8152 ix86_split_simple_return_pop_internal (rtx popc)
8153 {
8154 struct machine_function *m = cfun->machine;
8155 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8156 rtx_insn *insn;
8157
8158 /* There is no "pascal" calling convention in any 64bit ABI. */
8159 gcc_assert (!TARGET_64BIT);
8160
8161 insn = emit_insn (gen_pop (ecx));
8162 m->fs.cfa_offset -= UNITS_PER_WORD;
8163 m->fs.sp_offset -= UNITS_PER_WORD;
8164
8165 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8166 x = gen_rtx_SET (stack_pointer_rtx, x);
8167 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8168 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8169 RTX_FRAME_RELATED_P (insn) = 1;
8170
8171 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8172 x = gen_rtx_SET (stack_pointer_rtx, x);
8173 insn = emit_insn (x);
8174 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8175 RTX_FRAME_RELATED_P (insn) = 1;
8176
8177 /* Now return address is in ECX. */
8178 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8179 }
8180
8181 /* Errors in the source file can cause expand_expr to return const0_rtx
8182 where we expect a vector. To avoid crashing, use one of the vector
8183 clear instructions. */
8184
8185 static rtx
8186 safe_vector_operand (rtx x, machine_mode mode)
8187 {
8188 if (x == const0_rtx)
8189 x = CONST0_RTX (mode);
8190 return x;
8191 }
8192
8193 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8194
8195 static rtx
8196 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8197 {
8198 rtx pat;
8199 tree arg0 = CALL_EXPR_ARG (exp, 0);
8200 tree arg1 = CALL_EXPR_ARG (exp, 1);
8201 rtx op0 = expand_normal (arg0);
8202 rtx op1 = expand_normal (arg1);
8203 machine_mode tmode = insn_data[icode].operand[0].mode;
8204 machine_mode mode0 = insn_data[icode].operand[1].mode;
8205 machine_mode mode1 = insn_data[icode].operand[2].mode;
8206
8207 if (VECTOR_MODE_P (mode0))
8208 op0 = safe_vector_operand (op0, mode0);
8209 if (VECTOR_MODE_P (mode1))
8210 op1 = safe_vector_operand (op1, mode1);
8211
8212 if (optimize || !target
8213 || GET_MODE (target) != tmode
8214 || !insn_data[icode].operand[0].predicate (target, tmode))
8215 target = gen_reg_rtx (tmode);
8216
8217 if (GET_MODE (op1) == SImode && mode1 == TImode)
8218 {
8219 rtx x = gen_reg_rtx (V4SImode);
8220 emit_insn (gen_sse2_loadd (x, op1));
8221 op1 = gen_lowpart (TImode, x);
8222 }
8223
8224 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8225 op0 = copy_to_mode_reg (mode0, op0);
8226 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8227 op1 = copy_to_mode_reg (mode1, op1);
8228
8229 pat = GEN_FCN (icode) (target, op0, op1);
8230 if (! pat)
8231 return 0;
8232
8233 emit_insn (pat);
8234
8235 return target;
8236 }
8237
8238 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8239
8240 static rtx
8241 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8242 enum ix86_builtin_func_type m_type,
8243 enum rtx_code sub_code)
8244 {
8245 rtx pat;
8246 int i;
8247 int nargs;
8248 bool comparison_p = false;
8249 bool tf_p = false;
8250 bool last_arg_constant = false;
8251 int num_memory = 0;
8252 struct {
8253 rtx op;
8254 machine_mode mode;
8255 } args[4];
8256
8257 machine_mode tmode = insn_data[icode].operand[0].mode;
8258
8259 switch (m_type)
8260 {
8261 case MULTI_ARG_4_DF2_DI_I:
8262 case MULTI_ARG_4_DF2_DI_I1:
8263 case MULTI_ARG_4_SF2_SI_I:
8264 case MULTI_ARG_4_SF2_SI_I1:
8265 nargs = 4;
8266 last_arg_constant = true;
8267 break;
8268
8269 case MULTI_ARG_3_SF:
8270 case MULTI_ARG_3_DF:
8271 case MULTI_ARG_3_SF2:
8272 case MULTI_ARG_3_DF2:
8273 case MULTI_ARG_3_DI:
8274 case MULTI_ARG_3_SI:
8275 case MULTI_ARG_3_SI_DI:
8276 case MULTI_ARG_3_HI:
8277 case MULTI_ARG_3_HI_SI:
8278 case MULTI_ARG_3_QI:
8279 case MULTI_ARG_3_DI2:
8280 case MULTI_ARG_3_SI2:
8281 case MULTI_ARG_3_HI2:
8282 case MULTI_ARG_3_QI2:
8283 nargs = 3;
8284 break;
8285
8286 case MULTI_ARG_2_SF:
8287 case MULTI_ARG_2_DF:
8288 case MULTI_ARG_2_DI:
8289 case MULTI_ARG_2_SI:
8290 case MULTI_ARG_2_HI:
8291 case MULTI_ARG_2_QI:
8292 nargs = 2;
8293 break;
8294
8295 case MULTI_ARG_2_DI_IMM:
8296 case MULTI_ARG_2_SI_IMM:
8297 case MULTI_ARG_2_HI_IMM:
8298 case MULTI_ARG_2_QI_IMM:
8299 nargs = 2;
8300 last_arg_constant = true;
8301 break;
8302
8303 case MULTI_ARG_1_SF:
8304 case MULTI_ARG_1_DF:
8305 case MULTI_ARG_1_SF2:
8306 case MULTI_ARG_1_DF2:
8307 case MULTI_ARG_1_DI:
8308 case MULTI_ARG_1_SI:
8309 case MULTI_ARG_1_HI:
8310 case MULTI_ARG_1_QI:
8311 case MULTI_ARG_1_SI_DI:
8312 case MULTI_ARG_1_HI_DI:
8313 case MULTI_ARG_1_HI_SI:
8314 case MULTI_ARG_1_QI_DI:
8315 case MULTI_ARG_1_QI_SI:
8316 case MULTI_ARG_1_QI_HI:
8317 nargs = 1;
8318 break;
8319
8320 case MULTI_ARG_2_DI_CMP:
8321 case MULTI_ARG_2_SI_CMP:
8322 case MULTI_ARG_2_HI_CMP:
8323 case MULTI_ARG_2_QI_CMP:
8324 nargs = 2;
8325 comparison_p = true;
8326 break;
8327
8328 case MULTI_ARG_2_SF_TF:
8329 case MULTI_ARG_2_DF_TF:
8330 case MULTI_ARG_2_DI_TF:
8331 case MULTI_ARG_2_SI_TF:
8332 case MULTI_ARG_2_HI_TF:
8333 case MULTI_ARG_2_QI_TF:
8334 nargs = 2;
8335 tf_p = true;
8336 break;
8337
8338 default:
8339 gcc_unreachable ();
8340 }
8341
8342 if (optimize || !target
8343 || GET_MODE (target) != tmode
8344 || !insn_data[icode].operand[0].predicate (target, tmode))
8345 target = gen_reg_rtx (tmode);
8346 else if (memory_operand (target, tmode))
8347 num_memory++;
8348
8349 gcc_assert (nargs <= 4);
8350
8351 for (i = 0; i < nargs; i++)
8352 {
8353 tree arg = CALL_EXPR_ARG (exp, i);
8354 rtx op = expand_normal (arg);
8355 int adjust = (comparison_p) ? 1 : 0;
8356 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8357
8358 if (last_arg_constant && i == nargs - 1)
8359 {
8360 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8361 {
8362 enum insn_code new_icode = icode;
8363 switch (icode)
8364 {
8365 case CODE_FOR_xop_vpermil2v2df3:
8366 case CODE_FOR_xop_vpermil2v4sf3:
8367 case CODE_FOR_xop_vpermil2v4df3:
8368 case CODE_FOR_xop_vpermil2v8sf3:
8369 error ("the last argument must be a 2-bit immediate");
8370 return gen_reg_rtx (tmode);
8371 case CODE_FOR_xop_rotlv2di3:
8372 new_icode = CODE_FOR_rotlv2di3;
8373 goto xop_rotl;
8374 case CODE_FOR_xop_rotlv4si3:
8375 new_icode = CODE_FOR_rotlv4si3;
8376 goto xop_rotl;
8377 case CODE_FOR_xop_rotlv8hi3:
8378 new_icode = CODE_FOR_rotlv8hi3;
8379 goto xop_rotl;
8380 case CODE_FOR_xop_rotlv16qi3:
8381 new_icode = CODE_FOR_rotlv16qi3;
8382 xop_rotl:
8383 if (CONST_INT_P (op))
8384 {
8385 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8386 op = GEN_INT (INTVAL (op) & mask);
8387 gcc_checking_assert
8388 (insn_data[icode].operand[i + 1].predicate (op, mode));
8389 }
8390 else
8391 {
8392 gcc_checking_assert
8393 (nargs == 2
8394 && insn_data[new_icode].operand[0].mode == tmode
8395 && insn_data[new_icode].operand[1].mode == tmode
8396 && insn_data[new_icode].operand[2].mode == mode
8397 && insn_data[new_icode].operand[0].predicate
8398 == insn_data[icode].operand[0].predicate
8399 && insn_data[new_icode].operand[1].predicate
8400 == insn_data[icode].operand[1].predicate);
8401 icode = new_icode;
8402 goto non_constant;
8403 }
8404 break;
8405 default:
8406 gcc_unreachable ();
8407 }
8408 }
8409 }
8410 else
8411 {
8412 non_constant:
8413 if (VECTOR_MODE_P (mode))
8414 op = safe_vector_operand (op, mode);
8415
8416 /* If we aren't optimizing, only allow one memory operand to be
8417 generated. */
8418 if (memory_operand (op, mode))
8419 num_memory++;
8420
8421 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8422
8423 if (optimize
8424 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8425 || num_memory > 1)
8426 op = force_reg (mode, op);
8427 }
8428
8429 args[i].op = op;
8430 args[i].mode = mode;
8431 }
8432
8433 switch (nargs)
8434 {
8435 case 1:
8436 pat = GEN_FCN (icode) (target, args[0].op);
8437 break;
8438
8439 case 2:
8440 if (tf_p)
8441 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8442 GEN_INT ((int)sub_code));
8443 else if (! comparison_p)
8444 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8445 else
8446 {
8447 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8448 args[0].op,
8449 args[1].op);
8450
8451 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8452 }
8453 break;
8454
8455 case 3:
8456 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8457 break;
8458
8459 case 4:
8460 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8461 break;
8462
8463 default:
8464 gcc_unreachable ();
8465 }
8466
8467 if (! pat)
8468 return 0;
8469
8470 emit_insn (pat);
8471 return target;
8472 }
8473
8474 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8475 insns with vec_merge. */
8476
8477 static rtx
8478 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8479 rtx target)
8480 {
8481 rtx pat;
8482 tree arg0 = CALL_EXPR_ARG (exp, 0);
8483 rtx op1, op0 = expand_normal (arg0);
8484 machine_mode tmode = insn_data[icode].operand[0].mode;
8485 machine_mode mode0 = insn_data[icode].operand[1].mode;
8486
8487 if (optimize || !target
8488 || GET_MODE (target) != tmode
8489 || !insn_data[icode].operand[0].predicate (target, tmode))
8490 target = gen_reg_rtx (tmode);
8491
8492 if (VECTOR_MODE_P (mode0))
8493 op0 = safe_vector_operand (op0, mode0);
8494
8495 if ((optimize && !register_operand (op0, mode0))
8496 || !insn_data[icode].operand[1].predicate (op0, mode0))
8497 op0 = copy_to_mode_reg (mode0, op0);
8498
8499 op1 = op0;
8500 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8501 op1 = copy_to_mode_reg (mode0, op1);
8502
8503 pat = GEN_FCN (icode) (target, op0, op1);
8504 if (! pat)
8505 return 0;
8506 emit_insn (pat);
8507 return target;
8508 }
8509
8510 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8511
8512 static rtx
8513 ix86_expand_sse_compare (const struct builtin_description *d,
8514 tree exp, rtx target, bool swap)
8515 {
8516 rtx pat;
8517 tree arg0 = CALL_EXPR_ARG (exp, 0);
8518 tree arg1 = CALL_EXPR_ARG (exp, 1);
8519 rtx op0 = expand_normal (arg0);
8520 rtx op1 = expand_normal (arg1);
8521 rtx op2;
8522 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8523 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8524 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8525 enum rtx_code comparison = d->comparison;
8526
8527 if (VECTOR_MODE_P (mode0))
8528 op0 = safe_vector_operand (op0, mode0);
8529 if (VECTOR_MODE_P (mode1))
8530 op1 = safe_vector_operand (op1, mode1);
8531
8532 /* Swap operands if we have a comparison that isn't available in
8533 hardware. */
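  /* For example (an illustrative note): legacy SSE CMPPS/CMPPD only encode
     LT/LE-style predicates, so a GT comparison is generated as LT with the
     operands swapped.  */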
8534 if (swap)
8535 std::swap (op0, op1);
8536
8537 if (optimize || !target
8538 || GET_MODE (target) != tmode
8539 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8540 target = gen_reg_rtx (tmode);
8541
8542 if ((optimize && !register_operand (op0, mode0))
8543 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8544 op0 = copy_to_mode_reg (mode0, op0);
8545 if ((optimize && !register_operand (op1, mode1))
8546 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8547 op1 = copy_to_mode_reg (mode1, op1);
8548
8549 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8550 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8551 if (! pat)
8552 return 0;
8553 emit_insn (pat);
8554 return target;
8555 }
8556
8557 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
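/* The result is produced without a branch: the register is zeroed, the
   comi/ucomi pattern sets the flags, and a setcc writes the low byte.
   Roughly (an illustrative sketch; the condition depends on D->COMPARISON):

       xorl	%eax, %eax
       comiss	%xmm1, %xmm0
       sete	%al
   */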
8558
8559 static rtx
8560 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8561 rtx target)
8562 {
8563 rtx pat;
8564 tree arg0 = CALL_EXPR_ARG (exp, 0);
8565 tree arg1 = CALL_EXPR_ARG (exp, 1);
8566 rtx op0 = expand_normal (arg0);
8567 rtx op1 = expand_normal (arg1);
8568 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8569 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8570 enum rtx_code comparison = d->comparison;
8571
8572 if (VECTOR_MODE_P (mode0))
8573 op0 = safe_vector_operand (op0, mode0);
8574 if (VECTOR_MODE_P (mode1))
8575 op1 = safe_vector_operand (op1, mode1);
8576
8577 /* Swap operands if we have a comparison that isn't available in
8578 hardware. */
8579 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8580 std::swap (op0, op1);
8581
8582 target = gen_reg_rtx (SImode);
8583 emit_move_insn (target, const0_rtx);
8584 target = gen_rtx_SUBREG (QImode, target, 0);
8585
8586 if ((optimize && !register_operand (op0, mode0))
8587 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8588 op0 = copy_to_mode_reg (mode0, op0);
8589 if ((optimize && !register_operand (op1, mode1))
8590 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8591 op1 = copy_to_mode_reg (mode1, op1);
8592
8593 pat = GEN_FCN (d->icode) (op0, op1);
8594 if (! pat)
8595 return 0;
8596 emit_insn (pat);
8597 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8598 gen_rtx_fmt_ee (comparison, QImode,
8599 SET_DEST (pat),
8600 const0_rtx)));
8601
8602 return SUBREG_REG (target);
8603 }
8604
8605 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8606
8607 static rtx
8608 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8609 rtx target)
8610 {
8611 rtx pat;
8612 tree arg0 = CALL_EXPR_ARG (exp, 0);
8613 rtx op1, op0 = expand_normal (arg0);
8614 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8615 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8616
8617 if (optimize || target == 0
8618 || GET_MODE (target) != tmode
8619 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8620 target = gen_reg_rtx (tmode);
8621
8622 if (VECTOR_MODE_P (mode0))
8623 op0 = safe_vector_operand (op0, mode0);
8624
8625 if ((optimize && !register_operand (op0, mode0))
8626 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8627 op0 = copy_to_mode_reg (mode0, op0);
8628
8629 op1 = GEN_INT (d->comparison);
8630
8631 pat = GEN_FCN (d->icode) (target, op0, op1);
8632 if (! pat)
8633 return 0;
8634 emit_insn (pat);
8635 return target;
8636 }
8637
8638 static rtx
8639 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8640 tree exp, rtx target)
8641 {
8642 rtx pat;
8643 tree arg0 = CALL_EXPR_ARG (exp, 0);
8644 tree arg1 = CALL_EXPR_ARG (exp, 1);
8645 rtx op0 = expand_normal (arg0);
8646 rtx op1 = expand_normal (arg1);
8647 rtx op2;
8648 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8649 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8650 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8651
8652 if (optimize || target == 0
8653 || GET_MODE (target) != tmode
8654 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8655 target = gen_reg_rtx (tmode);
8656
8657 op0 = safe_vector_operand (op0, mode0);
8658 op1 = safe_vector_operand (op1, mode1);
8659
8660 if ((optimize && !register_operand (op0, mode0))
8661 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8662 op0 = copy_to_mode_reg (mode0, op0);
8663 if ((optimize && !register_operand (op1, mode1))
8664 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8665 op1 = copy_to_mode_reg (mode1, op1);
8666
8667 op2 = GEN_INT (d->comparison);
8668
8669 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8670 if (! pat)
8671 return 0;
8672 emit_insn (pat);
8673 return target;
8674 }
8675
8676 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8677
8678 static rtx
8679 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8680 rtx target)
8681 {
8682 rtx pat;
8683 tree arg0 = CALL_EXPR_ARG (exp, 0);
8684 tree arg1 = CALL_EXPR_ARG (exp, 1);
8685 rtx op0 = expand_normal (arg0);
8686 rtx op1 = expand_normal (arg1);
8687 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8688 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8689 enum rtx_code comparison = d->comparison;
8690
8691 if (VECTOR_MODE_P (mode0))
8692 op0 = safe_vector_operand (op0, mode0);
8693 if (VECTOR_MODE_P (mode1))
8694 op1 = safe_vector_operand (op1, mode1);
8695
8696 target = gen_reg_rtx (SImode);
8697 emit_move_insn (target, const0_rtx);
8698 target = gen_rtx_SUBREG (QImode, target, 0);
8699
8700 if ((optimize && !register_operand (op0, mode0))
8701 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8702 op0 = copy_to_mode_reg (mode0, op0);
8703 if ((optimize && !register_operand (op1, mode1))
8704 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8705 op1 = copy_to_mode_reg (mode1, op1);
8706
8707 pat = GEN_FCN (d->icode) (op0, op1);
8708 if (! pat)
8709 return 0;
8710 emit_insn (pat);
8711 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8712 gen_rtx_fmt_ee (comparison, QImode,
8713 SET_DEST (pat),
8714 const0_rtx)));
8715
8716 return SUBREG_REG (target);
8717 }
8718
8719 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
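/* These implement the SSE4.2 explicit-length string intrinsics, e.g.
   (an illustrative sketch)

       int     idx = _mm_cmpestri (a, la, b, lb, IMM);
       __m128i msk = _mm_cmpestrm (a, la, b, lb, IMM);

   where the explicit lengths travel as separate integer operands and IMM
   must be an 8-bit immediate; the remaining variants (_mm_cmpestrc etc.)
   return one of the resulting flag bits instead.  */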
8720
8721 static rtx
8722 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8723 tree exp, rtx target)
8724 {
8725 rtx pat;
8726 tree arg0 = CALL_EXPR_ARG (exp, 0);
8727 tree arg1 = CALL_EXPR_ARG (exp, 1);
8728 tree arg2 = CALL_EXPR_ARG (exp, 2);
8729 tree arg3 = CALL_EXPR_ARG (exp, 3);
8730 tree arg4 = CALL_EXPR_ARG (exp, 4);
8731 rtx scratch0, scratch1;
8732 rtx op0 = expand_normal (arg0);
8733 rtx op1 = expand_normal (arg1);
8734 rtx op2 = expand_normal (arg2);
8735 rtx op3 = expand_normal (arg3);
8736 rtx op4 = expand_normal (arg4);
8737 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8738
8739 tmode0 = insn_data[d->icode].operand[0].mode;
8740 tmode1 = insn_data[d->icode].operand[1].mode;
8741 modev2 = insn_data[d->icode].operand[2].mode;
8742 modei3 = insn_data[d->icode].operand[3].mode;
8743 modev4 = insn_data[d->icode].operand[4].mode;
8744 modei5 = insn_data[d->icode].operand[5].mode;
8745 modeimm = insn_data[d->icode].operand[6].mode;
8746
8747 if (VECTOR_MODE_P (modev2))
8748 op0 = safe_vector_operand (op0, modev2);
8749 if (VECTOR_MODE_P (modev4))
8750 op2 = safe_vector_operand (op2, modev4);
8751
8752 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8753 op0 = copy_to_mode_reg (modev2, op0);
8754 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8755 op1 = copy_to_mode_reg (modei3, op1);
8756 if ((optimize && !register_operand (op2, modev4))
8757 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8758 op2 = copy_to_mode_reg (modev4, op2);
8759 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8760 op3 = copy_to_mode_reg (modei5, op3);
8761
8762 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8763 {
8764 error ("the fifth argument must be an 8-bit immediate");
8765 return const0_rtx;
8766 }
8767
8768 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8769 {
8770 if (optimize || !target
8771 || GET_MODE (target) != tmode0
8772 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8773 target = gen_reg_rtx (tmode0);
8774
8775 scratch1 = gen_reg_rtx (tmode1);
8776
8777 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8778 }
8779 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8780 {
8781 if (optimize || !target
8782 || GET_MODE (target) != tmode1
8783 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8784 target = gen_reg_rtx (tmode1);
8785
8786 scratch0 = gen_reg_rtx (tmode0);
8787
8788 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8789 }
8790 else
8791 {
8792 gcc_assert (d->flag);
8793
8794 scratch0 = gen_reg_rtx (tmode0);
8795 scratch1 = gen_reg_rtx (tmode1);
8796
8797 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8798 }
8799
8800 if (! pat)
8801 return 0;
8802
8803 emit_insn (pat);
8804
8805 if (d->flag)
8806 {
8807 target = gen_reg_rtx (SImode);
8808 emit_move_insn (target, const0_rtx);
8809 target = gen_rtx_SUBREG (QImode, target, 0);
8810
8811 emit_insn
8812 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8813 gen_rtx_fmt_ee (EQ, QImode,
8814 gen_rtx_REG ((machine_mode) d->flag,
8815 FLAGS_REG),
8816 const0_rtx)));
8817 return SUBREG_REG (target);
8818 }
8819 else
8820 return target;
8821 }
8822
8823
8824 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8825
8826 static rtx
8827 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8828 tree exp, rtx target)
8829 {
8830 rtx pat;
8831 tree arg0 = CALL_EXPR_ARG (exp, 0);
8832 tree arg1 = CALL_EXPR_ARG (exp, 1);
8833 tree arg2 = CALL_EXPR_ARG (exp, 2);
8834 rtx scratch0, scratch1;
8835 rtx op0 = expand_normal (arg0);
8836 rtx op1 = expand_normal (arg1);
8837 rtx op2 = expand_normal (arg2);
8838 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8839
8840 tmode0 = insn_data[d->icode].operand[0].mode;
8841 tmode1 = insn_data[d->icode].operand[1].mode;
8842 modev2 = insn_data[d->icode].operand[2].mode;
8843 modev3 = insn_data[d->icode].operand[3].mode;
8844 modeimm = insn_data[d->icode].operand[4].mode;
8845
8846 if (VECTOR_MODE_P (modev2))
8847 op0 = safe_vector_operand (op0, modev2);
8848 if (VECTOR_MODE_P (modev3))
8849 op1 = safe_vector_operand (op1, modev3);
8850
8851 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8852 op0 = copy_to_mode_reg (modev2, op0);
8853 if ((optimize && !register_operand (op1, modev3))
8854 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8855 op1 = copy_to_mode_reg (modev3, op1);
8856
8857 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8858 {
8859 error ("the third argument must be an 8-bit immediate");
8860 return const0_rtx;
8861 }
8862
8863 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8864 {
8865 if (optimize || !target
8866 || GET_MODE (target) != tmode0
8867 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8868 target = gen_reg_rtx (tmode0);
8869
8870 scratch1 = gen_reg_rtx (tmode1);
8871
8872 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8873 }
8874 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8875 {
8876 if (optimize || !target
8877 || GET_MODE (target) != tmode1
8878 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8879 target = gen_reg_rtx (tmode1);
8880
8881 scratch0 = gen_reg_rtx (tmode0);
8882
8883 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8884 }
8885 else
8886 {
8887 gcc_assert (d->flag);
8888
8889 scratch0 = gen_reg_rtx (tmode0);
8890 scratch1 = gen_reg_rtx (tmode1);
8891
8892 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8893 }
8894
8895 if (! pat)
8896 return 0;
8897
8898 emit_insn (pat);
8899
8900 if (d->flag)
8901 {
8902 target = gen_reg_rtx (SImode);
8903 emit_move_insn (target, const0_rtx);
8904 target = gen_rtx_SUBREG (QImode, target, 0);
8905
8906 emit_insn
8907 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8908 gen_rtx_fmt_ee (EQ, QImode,
8909 gen_rtx_REG ((machine_mode) d->flag,
8910 FLAGS_REG),
8911 const0_rtx)));
8912 return SUBREG_REG (target);
8913 }
8914 else
8915 return target;
8916 }
8917
8918 /* Fix up modeless constants to fit the required mode.  */
8919
8920 static rtx
8921 fixup_modeless_constant (rtx x, machine_mode mode)
8922 {
8923 if (GET_MODE (x) == VOIDmode)
8924 x = convert_to_mode (mode, x, 1);
8925 return x;
8926 }
8927
8928 /* Subroutine of ix86_expand_builtin to take care of insns with a
8929    variable number of operands.  */
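/* An illustrative reading of the IX86_BUILTIN_FUNC_TYPE names decoded in
   the switch below (the convention is RETURN_FTYPE_ARGS...):

     V4SF_FTYPE_V4SF_V4SF      two V4SF inputs, V4SF result (nargs = 2)
     V8SI_FTYPE_V8SI_V8SI_INT  as above plus a trailing immediate
			       (nargs = 3, nargs_constant = 1)  */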
8930
8931 static rtx
8932 ix86_expand_args_builtin (const struct builtin_description *d,
8933 tree exp, rtx target)
8934 {
8935 rtx pat, real_target;
8936 unsigned int i, nargs;
8937 unsigned int nargs_constant = 0;
8938 unsigned int mask_pos = 0;
8939 int num_memory = 0;
8940 struct
8941 {
8942 rtx op;
8943 machine_mode mode;
8944 } args[6];
8945 bool second_arg_count = false;
8946 enum insn_code icode = d->icode;
8947 const struct insn_data_d *insn_p = &insn_data[icode];
8948 machine_mode tmode = insn_p->operand[0].mode;
8949 machine_mode rmode = VOIDmode;
8950 bool swap = false;
8951 enum rtx_code comparison = d->comparison;
8952
8953 switch ((enum ix86_builtin_func_type) d->flag)
8954 {
8955 case V2DF_FTYPE_V2DF_ROUND:
8956 case V4DF_FTYPE_V4DF_ROUND:
8957 case V8DF_FTYPE_V8DF_ROUND:
8958 case V4SF_FTYPE_V4SF_ROUND:
8959 case V8SF_FTYPE_V8SF_ROUND:
8960 case V16SF_FTYPE_V16SF_ROUND:
8961 case V4SI_FTYPE_V4SF_ROUND:
8962 case V8SI_FTYPE_V8SF_ROUND:
8963 case V16SI_FTYPE_V16SF_ROUND:
8964 return ix86_expand_sse_round (d, exp, target);
8965 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8966 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8967 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8968 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8969 case INT_FTYPE_V8SF_V8SF_PTEST:
8970 case INT_FTYPE_V4DI_V4DI_PTEST:
8971 case INT_FTYPE_V4DF_V4DF_PTEST:
8972 case INT_FTYPE_V4SF_V4SF_PTEST:
8973 case INT_FTYPE_V2DI_V2DI_PTEST:
8974 case INT_FTYPE_V2DF_V2DF_PTEST:
8975 return ix86_expand_sse_ptest (d, exp, target);
8976 case FLOAT128_FTYPE_FLOAT128:
8977 case FLOAT_FTYPE_FLOAT:
8978 case INT_FTYPE_INT:
8979 case UINT_FTYPE_UINT:
8980 case UINT16_FTYPE_UINT16:
8981 case UINT64_FTYPE_INT:
8982 case UINT64_FTYPE_UINT64:
8983 case INT64_FTYPE_INT64:
8984 case INT64_FTYPE_V4SF:
8985 case INT64_FTYPE_V2DF:
8986 case INT_FTYPE_V16QI:
8987 case INT_FTYPE_V8QI:
8988 case INT_FTYPE_V8SF:
8989 case INT_FTYPE_V4DF:
8990 case INT_FTYPE_V4SF:
8991 case INT_FTYPE_V2DF:
8992 case INT_FTYPE_V32QI:
8993 case V16QI_FTYPE_V16QI:
8994 case V8SI_FTYPE_V8SF:
8995 case V8SI_FTYPE_V4SI:
8996 case V8HI_FTYPE_V8HI:
8997 case V8HI_FTYPE_V16QI:
8998 case V8QI_FTYPE_V8QI:
8999 case V8SF_FTYPE_V8SF:
9000 case V8SF_FTYPE_V8SI:
9001 case V8SF_FTYPE_V4SF:
9002 case V8SF_FTYPE_V8HI:
9003 case V4SI_FTYPE_V4SI:
9004 case V4SI_FTYPE_V16QI:
9005 case V4SI_FTYPE_V4SF:
9006 case V4SI_FTYPE_V8SI:
9007 case V4SI_FTYPE_V8HI:
9008 case V4SI_FTYPE_V4DF:
9009 case V4SI_FTYPE_V2DF:
9010 case V4HI_FTYPE_V4HI:
9011 case V4DF_FTYPE_V4DF:
9012 case V4DF_FTYPE_V4SI:
9013 case V4DF_FTYPE_V4SF:
9014 case V4DF_FTYPE_V2DF:
9015 case V4SF_FTYPE_V4SF:
9016 case V4SF_FTYPE_V4SI:
9017 case V4SF_FTYPE_V8SF:
9018 case V4SF_FTYPE_V4DF:
9019 case V4SF_FTYPE_V8HI:
9020 case V4SF_FTYPE_V2DF:
9021 case V2DI_FTYPE_V2DI:
9022 case V2DI_FTYPE_V16QI:
9023 case V2DI_FTYPE_V8HI:
9024 case V2DI_FTYPE_V4SI:
9025 case V2DF_FTYPE_V2DF:
9026 case V2DF_FTYPE_V4SI:
9027 case V2DF_FTYPE_V4DF:
9028 case V2DF_FTYPE_V4SF:
9029 case V2DF_FTYPE_V2SI:
9030 case V2SI_FTYPE_V2SI:
9031 case V2SI_FTYPE_V4SF:
9032 case V2SI_FTYPE_V2SF:
9033 case V2SI_FTYPE_V2DF:
9034 case V2SF_FTYPE_V2SF:
9035 case V2SF_FTYPE_V2SI:
9036 case V32QI_FTYPE_V32QI:
9037 case V32QI_FTYPE_V16QI:
9038 case V16HI_FTYPE_V16HI:
9039 case V16HI_FTYPE_V8HI:
9040 case V8SI_FTYPE_V8SI:
9041 case V16HI_FTYPE_V16QI:
9042 case V8SI_FTYPE_V16QI:
9043 case V4DI_FTYPE_V16QI:
9044 case V8SI_FTYPE_V8HI:
9045 case V4DI_FTYPE_V8HI:
9046 case V4DI_FTYPE_V4SI:
9047 case V4DI_FTYPE_V2DI:
9048 case UQI_FTYPE_UQI:
9049 case UHI_FTYPE_UHI:
9050 case USI_FTYPE_USI:
9051 case USI_FTYPE_UQI:
9052 case USI_FTYPE_UHI:
9053 case UDI_FTYPE_UDI:
9054 case UHI_FTYPE_V16QI:
9055 case USI_FTYPE_V32QI:
9056 case UDI_FTYPE_V64QI:
9057 case V16QI_FTYPE_UHI:
9058 case V32QI_FTYPE_USI:
9059 case V64QI_FTYPE_UDI:
9060 case V8HI_FTYPE_UQI:
9061 case V16HI_FTYPE_UHI:
9062 case V32HI_FTYPE_USI:
9063 case V4SI_FTYPE_UQI:
9064 case V8SI_FTYPE_UQI:
9065 case V4SI_FTYPE_UHI:
9066 case V8SI_FTYPE_UHI:
9067 case UQI_FTYPE_V8HI:
9068 case UHI_FTYPE_V16HI:
9069 case USI_FTYPE_V32HI:
9070 case UQI_FTYPE_V4SI:
9071 case UQI_FTYPE_V8SI:
9072 case UHI_FTYPE_V16SI:
9073 case UQI_FTYPE_V2DI:
9074 case UQI_FTYPE_V4DI:
9075 case UQI_FTYPE_V8DI:
9076 case V16SI_FTYPE_UHI:
9077 case V2DI_FTYPE_UQI:
9078 case V4DI_FTYPE_UQI:
9079 case V16SI_FTYPE_INT:
9080 case V16SF_FTYPE_V8SF:
9081 case V16SI_FTYPE_V8SI:
9082 case V16SF_FTYPE_V4SF:
9083 case V16SI_FTYPE_V4SI:
9084 case V16SI_FTYPE_V16SF:
9085 case V16SI_FTYPE_V16SI:
9086 case V64QI_FTYPE_V64QI:
9087 case V32HI_FTYPE_V32HI:
9088 case V16SF_FTYPE_V16SF:
9089 case V8DI_FTYPE_UQI:
9090 case V8DI_FTYPE_V8DI:
9091 case V8DF_FTYPE_V4DF:
9092 case V8DF_FTYPE_V2DF:
9093 case V8DF_FTYPE_V8DF:
9094 case V4DI_FTYPE_V4DI:
9095 case V16HI_FTYPE_V16SF:
9096 case V8HI_FTYPE_V8SF:
9097 case V8HI_FTYPE_V4SF:
9098 nargs = 1;
9099 break;
9100 case V4SF_FTYPE_V4SF_VEC_MERGE:
9101 case V2DF_FTYPE_V2DF_VEC_MERGE:
9102 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9103 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9104 case V16QI_FTYPE_V16QI_V16QI:
9105 case V16QI_FTYPE_V8HI_V8HI:
9106 case V16SF_FTYPE_V16SF_V16SF:
9107 case V8QI_FTYPE_V8QI_V8QI:
9108 case V8QI_FTYPE_V4HI_V4HI:
9109 case V8HI_FTYPE_V8HI_V8HI:
9110 case V8HI_FTYPE_V16QI_V16QI:
9111 case V8HI_FTYPE_V4SI_V4SI:
9112 case V8SF_FTYPE_V8SF_V8SF:
9113 case V8SF_FTYPE_V8SF_V8SI:
9114 case V8DF_FTYPE_V8DF_V8DF:
9115 case V4SI_FTYPE_V4SI_V4SI:
9116 case V4SI_FTYPE_V8HI_V8HI:
9117 case V4SI_FTYPE_V2DF_V2DF:
9118 case V4HI_FTYPE_V4HI_V4HI:
9119 case V4HI_FTYPE_V8QI_V8QI:
9120 case V4HI_FTYPE_V2SI_V2SI:
9121 case V4DF_FTYPE_V4DF_V4DF:
9122 case V4DF_FTYPE_V4DF_V4DI:
9123 case V4SF_FTYPE_V4SF_V4SF:
9124 case V4SF_FTYPE_V4SF_V4SI:
9125 case V4SF_FTYPE_V4SF_V2SI:
9126 case V4SF_FTYPE_V4SF_V2DF:
9127 case V4SF_FTYPE_V4SF_UINT:
9128 case V4SF_FTYPE_V4SF_DI:
9129 case V4SF_FTYPE_V4SF_SI:
9130 case V2DI_FTYPE_V2DI_V2DI:
9131 case V2DI_FTYPE_V16QI_V16QI:
9132 case V2DI_FTYPE_V4SI_V4SI:
9133 case V2DI_FTYPE_V2DI_V16QI:
9134 case V2SI_FTYPE_V2SI_V2SI:
9135 case V2SI_FTYPE_V4HI_V4HI:
9136 case V2SI_FTYPE_V2SF_V2SF:
9137 case V2DF_FTYPE_V2DF_V2DF:
9138 case V2DF_FTYPE_V2DF_V4SF:
9139 case V2DF_FTYPE_V2DF_V2DI:
9140 case V2DF_FTYPE_V2DF_DI:
9141 case V2DF_FTYPE_V2DF_SI:
9142 case V2DF_FTYPE_V2DF_UINT:
9143 case V2SF_FTYPE_V2SF_V2SF:
9144 case V1DI_FTYPE_V1DI_V1DI:
9145 case V1DI_FTYPE_V8QI_V8QI:
9146 case V1DI_FTYPE_V2SI_V2SI:
9147 case V32QI_FTYPE_V16HI_V16HI:
9148 case V16HI_FTYPE_V8SI_V8SI:
9149 case V64QI_FTYPE_V64QI_V64QI:
9150 case V32QI_FTYPE_V32QI_V32QI:
9151 case V16HI_FTYPE_V32QI_V32QI:
9152 case V16HI_FTYPE_V16HI_V16HI:
9153 case V8SI_FTYPE_V4DF_V4DF:
9154 case V8SI_FTYPE_V8SI_V8SI:
9155 case V8SI_FTYPE_V16HI_V16HI:
9156 case V4DI_FTYPE_V4DI_V4DI:
9157 case V4DI_FTYPE_V8SI_V8SI:
9158 case V8DI_FTYPE_V64QI_V64QI:
9159 if (comparison == UNKNOWN)
9160 return ix86_expand_binop_builtin (icode, exp, target);
9161 nargs = 2;
9162 break;
9163 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9164 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9165 gcc_assert (comparison != UNKNOWN);
9166 nargs = 2;
9167 swap = true;
9168 break;
9169 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9170 case V16HI_FTYPE_V16HI_SI_COUNT:
9171 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9172 case V8SI_FTYPE_V8SI_SI_COUNT:
9173 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9174 case V4DI_FTYPE_V4DI_INT_COUNT:
9175 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9176 case V8HI_FTYPE_V8HI_SI_COUNT:
9177 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9178 case V4SI_FTYPE_V4SI_SI_COUNT:
9179 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9180 case V4HI_FTYPE_V4HI_SI_COUNT:
9181 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9182 case V2DI_FTYPE_V2DI_SI_COUNT:
9183 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9184 case V2SI_FTYPE_V2SI_SI_COUNT:
9185 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9186 case V1DI_FTYPE_V1DI_SI_COUNT:
9187 nargs = 2;
9188 second_arg_count = true;
9189 break;
9190 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9191 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9192 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9193 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9194 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9195 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9196 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9197 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9198 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9199 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9200 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9201 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9202 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9203 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9204 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9205 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9206 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9207 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9208 nargs = 4;
9209 second_arg_count = true;
9210 break;
9211 case UINT64_FTYPE_UINT64_UINT64:
9212 case UINT_FTYPE_UINT_UINT:
9213 case UINT_FTYPE_UINT_USHORT:
9214 case UINT_FTYPE_UINT_UCHAR:
9215 case UINT16_FTYPE_UINT16_INT:
9216 case UINT8_FTYPE_UINT8_INT:
9217 case UQI_FTYPE_UQI_UQI:
9218 case UHI_FTYPE_UHI_UHI:
9219 case USI_FTYPE_USI_USI:
9220 case UDI_FTYPE_UDI_UDI:
9221 case V16SI_FTYPE_V8DF_V8DF:
9222 case V32HI_FTYPE_V16SF_V16SF:
9223 case V16HI_FTYPE_V8SF_V8SF:
9224 case V8HI_FTYPE_V4SF_V4SF:
9225 case V16HI_FTYPE_V16SF_UHI:
9226 case V8HI_FTYPE_V8SF_UQI:
9227 case V8HI_FTYPE_V4SF_UQI:
9228 nargs = 2;
9229 break;
9230 case V2DI_FTYPE_V2DI_INT_CONVERT:
9231 nargs = 2;
9232 rmode = V1TImode;
9233 nargs_constant = 1;
9234 break;
9235 case V4DI_FTYPE_V4DI_INT_CONVERT:
9236 nargs = 2;
9237 rmode = V2TImode;
9238 nargs_constant = 1;
9239 break;
9240 case V8DI_FTYPE_V8DI_INT_CONVERT:
9241 nargs = 2;
9242 rmode = V4TImode;
9243 nargs_constant = 1;
9244 break;
9245 case V8HI_FTYPE_V8HI_INT:
9246 case V8HI_FTYPE_V8SF_INT:
9247 case V16HI_FTYPE_V16SF_INT:
9248 case V8HI_FTYPE_V4SF_INT:
9249 case V8SF_FTYPE_V8SF_INT:
9250 case V4SF_FTYPE_V16SF_INT:
9251 case V16SF_FTYPE_V16SF_INT:
9252 case V4SI_FTYPE_V4SI_INT:
9253 case V4SI_FTYPE_V8SI_INT:
9254 case V4HI_FTYPE_V4HI_INT:
9255 case V4DF_FTYPE_V4DF_INT:
9256 case V4DF_FTYPE_V8DF_INT:
9257 case V4SF_FTYPE_V4SF_INT:
9258 case V4SF_FTYPE_V8SF_INT:
9259 case V2DI_FTYPE_V2DI_INT:
9260 case V2DF_FTYPE_V2DF_INT:
9261 case V2DF_FTYPE_V4DF_INT:
9262 case V16HI_FTYPE_V16HI_INT:
9263 case V8SI_FTYPE_V8SI_INT:
9264 case V16SI_FTYPE_V16SI_INT:
9265 case V4SI_FTYPE_V16SI_INT:
9266 case V4DI_FTYPE_V4DI_INT:
9267 case V2DI_FTYPE_V4DI_INT:
9268 case V4DI_FTYPE_V8DI_INT:
9269 case UQI_FTYPE_UQI_UQI_CONST:
9270 case UHI_FTYPE_UHI_UQI:
9271 case USI_FTYPE_USI_UQI:
9272 case UDI_FTYPE_UDI_UQI:
9273 nargs = 2;
9274 nargs_constant = 1;
9275 break;
9276 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9277 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9278 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9279 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9280 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9281 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9282 case UHI_FTYPE_V16SI_V16SI_UHI:
9283 case UQI_FTYPE_V8DI_V8DI_UQI:
9284 case V16HI_FTYPE_V16SI_V16HI_UHI:
9285 case V16QI_FTYPE_V16SI_V16QI_UHI:
9286 case V16QI_FTYPE_V8DI_V16QI_UQI:
9287 case V16SF_FTYPE_V16SF_V16SF_UHI:
9288 case V16SF_FTYPE_V4SF_V16SF_UHI:
9289 case V16SI_FTYPE_SI_V16SI_UHI:
9290 case V16SI_FTYPE_V16HI_V16SI_UHI:
9291 case V16SI_FTYPE_V16QI_V16SI_UHI:
9292 case V8SF_FTYPE_V4SF_V8SF_UQI:
9293 case V4DF_FTYPE_V2DF_V4DF_UQI:
9294 case V8SI_FTYPE_V4SI_V8SI_UQI:
9295 case V8SI_FTYPE_SI_V8SI_UQI:
9296 case V4SI_FTYPE_V4SI_V4SI_UQI:
9297 case V4SI_FTYPE_SI_V4SI_UQI:
9298 case V4DI_FTYPE_V2DI_V4DI_UQI:
9299 case V4DI_FTYPE_DI_V4DI_UQI:
9300 case V2DI_FTYPE_V2DI_V2DI_UQI:
9301 case V2DI_FTYPE_DI_V2DI_UQI:
9302 case V64QI_FTYPE_V64QI_V64QI_UDI:
9303 case V64QI_FTYPE_V16QI_V64QI_UDI:
9304 case V64QI_FTYPE_QI_V64QI_UDI:
9305 case V32QI_FTYPE_V32QI_V32QI_USI:
9306 case V32QI_FTYPE_V16QI_V32QI_USI:
9307 case V32QI_FTYPE_QI_V32QI_USI:
9308 case V16QI_FTYPE_V16QI_V16QI_UHI:
9309 case V16QI_FTYPE_QI_V16QI_UHI:
9310 case V32HI_FTYPE_V8HI_V32HI_USI:
9311 case V32HI_FTYPE_HI_V32HI_USI:
9312 case V16HI_FTYPE_V8HI_V16HI_UHI:
9313 case V16HI_FTYPE_HI_V16HI_UHI:
9314 case V8HI_FTYPE_V8HI_V8HI_UQI:
9315 case V8HI_FTYPE_HI_V8HI_UQI:
9316 case V8SF_FTYPE_V8HI_V8SF_UQI:
9317 case V4SF_FTYPE_V8HI_V4SF_UQI:
9318 case V8SI_FTYPE_V8SF_V8SI_UQI:
9319 case V4SI_FTYPE_V4SF_V4SI_UQI:
9320 case V4DI_FTYPE_V4SF_V4DI_UQI:
9321 case V2DI_FTYPE_V4SF_V2DI_UQI:
9322 case V4SF_FTYPE_V4DI_V4SF_UQI:
9323 case V4SF_FTYPE_V2DI_V4SF_UQI:
9324 case V4DF_FTYPE_V4DI_V4DF_UQI:
9325 case V2DF_FTYPE_V2DI_V2DF_UQI:
9326 case V16QI_FTYPE_V8HI_V16QI_UQI:
9327 case V16QI_FTYPE_V16HI_V16QI_UHI:
9328 case V16QI_FTYPE_V4SI_V16QI_UQI:
9329 case V16QI_FTYPE_V8SI_V16QI_UQI:
9330 case V8HI_FTYPE_V4SI_V8HI_UQI:
9331 case V8HI_FTYPE_V8SI_V8HI_UQI:
9332 case V16QI_FTYPE_V2DI_V16QI_UQI:
9333 case V16QI_FTYPE_V4DI_V16QI_UQI:
9334 case V8HI_FTYPE_V2DI_V8HI_UQI:
9335 case V8HI_FTYPE_V4DI_V8HI_UQI:
9336 case V4SI_FTYPE_V2DI_V4SI_UQI:
9337 case V4SI_FTYPE_V4DI_V4SI_UQI:
9338 case V32QI_FTYPE_V32HI_V32QI_USI:
9339 case UHI_FTYPE_V16QI_V16QI_UHI:
9340 case USI_FTYPE_V32QI_V32QI_USI:
9341 case UDI_FTYPE_V64QI_V64QI_UDI:
9342 case UQI_FTYPE_V8HI_V8HI_UQI:
9343 case UHI_FTYPE_V16HI_V16HI_UHI:
9344 case USI_FTYPE_V32HI_V32HI_USI:
9345 case UQI_FTYPE_V4SI_V4SI_UQI:
9346 case UQI_FTYPE_V8SI_V8SI_UQI:
9347 case UQI_FTYPE_V2DI_V2DI_UQI:
9348 case UQI_FTYPE_V4DI_V4DI_UQI:
9349 case V4SF_FTYPE_V2DF_V4SF_UQI:
9350 case V4SF_FTYPE_V4DF_V4SF_UQI:
9351 case V16SI_FTYPE_V16SI_V16SI_UHI:
9352 case V16SI_FTYPE_V4SI_V16SI_UHI:
9353 case V2DI_FTYPE_V4SI_V2DI_UQI:
9354 case V2DI_FTYPE_V8HI_V2DI_UQI:
9355 case V2DI_FTYPE_V16QI_V2DI_UQI:
9356 case V4DI_FTYPE_V4DI_V4DI_UQI:
9357 case V4DI_FTYPE_V4SI_V4DI_UQI:
9358 case V4DI_FTYPE_V8HI_V4DI_UQI:
9359 case V4DI_FTYPE_V16QI_V4DI_UQI:
9360 case V4DI_FTYPE_V4DF_V4DI_UQI:
9361 case V2DI_FTYPE_V2DF_V2DI_UQI:
9362 case V4SI_FTYPE_V4DF_V4SI_UQI:
9363 case V4SI_FTYPE_V2DF_V4SI_UQI:
9364 case V4SI_FTYPE_V8HI_V4SI_UQI:
9365 case V4SI_FTYPE_V16QI_V4SI_UQI:
9366 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9367 case V8DF_FTYPE_V2DF_V8DF_UQI:
9368 case V8DF_FTYPE_V4DF_V8DF_UQI:
9369 case V8DF_FTYPE_V8DF_V8DF_UQI:
9370 case V8SF_FTYPE_V8SF_V8SF_UQI:
9371 case V8SF_FTYPE_V8SI_V8SF_UQI:
9372 case V4DF_FTYPE_V4DF_V4DF_UQI:
9373 case V4SF_FTYPE_V4SF_V4SF_UQI:
9374 case V2DF_FTYPE_V2DF_V2DF_UQI:
9375 case V2DF_FTYPE_V4SF_V2DF_UQI:
9376 case V2DF_FTYPE_V4SI_V2DF_UQI:
9377 case V4SF_FTYPE_V4SI_V4SF_UQI:
9378 case V4DF_FTYPE_V4SF_V4DF_UQI:
9379 case V4DF_FTYPE_V4SI_V4DF_UQI:
9380 case V8SI_FTYPE_V8SI_V8SI_UQI:
9381 case V8SI_FTYPE_V8HI_V8SI_UQI:
9382 case V8SI_FTYPE_V16QI_V8SI_UQI:
9383 case V8DF_FTYPE_V8SI_V8DF_UQI:
9384 case V8DI_FTYPE_DI_V8DI_UQI:
9385 case V16SF_FTYPE_V8SF_V16SF_UHI:
9386 case V16SI_FTYPE_V8SI_V16SI_UHI:
9387 case V16HI_FTYPE_V16HI_V16HI_UHI:
9388 case V8HI_FTYPE_V16QI_V8HI_UQI:
9389 case V16HI_FTYPE_V16QI_V16HI_UHI:
9390 case V32HI_FTYPE_V32HI_V32HI_USI:
9391 case V32HI_FTYPE_V32QI_V32HI_USI:
9392 case V8DI_FTYPE_V16QI_V8DI_UQI:
9393 case V8DI_FTYPE_V2DI_V8DI_UQI:
9394 case V8DI_FTYPE_V4DI_V8DI_UQI:
9395 case V8DI_FTYPE_V8DI_V8DI_UQI:
9396 case V8DI_FTYPE_V8HI_V8DI_UQI:
9397 case V8DI_FTYPE_V8SI_V8DI_UQI:
9398 case V8HI_FTYPE_V8DI_V8HI_UQI:
9399 case V8SI_FTYPE_V8DI_V8SI_UQI:
9400 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9401 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9402 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9403 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9404 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9405 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9406 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9407 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9408 case V32HI_FTYPE_V16SF_V16SF_USI:
9409 case V16HI_FTYPE_V8SF_V8SF_UHI:
9410 case V8HI_FTYPE_V4SF_V4SF_UQI:
9411 case V16HI_FTYPE_V16SF_V16HI_UHI:
9412 case V8HI_FTYPE_V8SF_V8HI_UQI:
9413 case V8HI_FTYPE_V4SF_V8HI_UQI:
9414 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9415 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9416 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9417 nargs = 3;
9418 break;
9419 case V32QI_FTYPE_V32QI_V32QI_INT:
9420 case V16HI_FTYPE_V16HI_V16HI_INT:
9421 case V16QI_FTYPE_V16QI_V16QI_INT:
9422 case V4DI_FTYPE_V4DI_V4DI_INT:
9423 case V8HI_FTYPE_V8HI_V8HI_INT:
9424 case V8SI_FTYPE_V8SI_V8SI_INT:
9425 case V8SI_FTYPE_V8SI_V4SI_INT:
9426 case V8SF_FTYPE_V8SF_V8SF_INT:
9427 case V8SF_FTYPE_V8SF_V4SF_INT:
9428 case V4SI_FTYPE_V4SI_V4SI_INT:
9429 case V4DF_FTYPE_V4DF_V4DF_INT:
9430 case V16SF_FTYPE_V16SF_V16SF_INT:
9431 case V16SF_FTYPE_V16SF_V4SF_INT:
9432 case V16SI_FTYPE_V16SI_V4SI_INT:
9433 case V4DF_FTYPE_V4DF_V2DF_INT:
9434 case V4SF_FTYPE_V4SF_V4SF_INT:
9435 case V2DI_FTYPE_V2DI_V2DI_INT:
9436 case V4DI_FTYPE_V4DI_V2DI_INT:
9437 case V2DF_FTYPE_V2DF_V2DF_INT:
9438 case UQI_FTYPE_V8DI_V8UDI_INT:
9439 case UQI_FTYPE_V8DF_V8DF_INT:
9440 case UQI_FTYPE_V2DF_V2DF_INT:
9441 case UQI_FTYPE_V4SF_V4SF_INT:
9442 case UHI_FTYPE_V16SI_V16SI_INT:
9443 case UHI_FTYPE_V16SF_V16SF_INT:
9444 case V64QI_FTYPE_V64QI_V64QI_INT:
9445 case V32HI_FTYPE_V32HI_V32HI_INT:
9446 case V16SI_FTYPE_V16SI_V16SI_INT:
9447 case V8DI_FTYPE_V8DI_V8DI_INT:
9448 nargs = 3;
9449 nargs_constant = 1;
9450 break;
9451 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9452 nargs = 3;
9453 rmode = V4DImode;
9454 nargs_constant = 1;
9455 break;
9456 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9457 nargs = 3;
9458 rmode = V2DImode;
9459 nargs_constant = 1;
9460 break;
9461 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9462 nargs = 3;
9463 rmode = DImode;
9464 nargs_constant = 1;
9465 break;
9466 case V2DI_FTYPE_V2DI_UINT_UINT:
9467 nargs = 3;
9468 nargs_constant = 2;
9469 break;
9470 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9471 nargs = 3;
9472 rmode = V8DImode;
9473 nargs_constant = 1;
9474 break;
9475 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9476 nargs = 5;
9477 rmode = V8DImode;
9478 mask_pos = 2;
9479 nargs_constant = 1;
9480 break;
9481 case QI_FTYPE_V8DF_INT_UQI:
9482 case QI_FTYPE_V4DF_INT_UQI:
9483 case QI_FTYPE_V2DF_INT_UQI:
9484 case HI_FTYPE_V16SF_INT_UHI:
9485 case QI_FTYPE_V8SF_INT_UQI:
9486 case QI_FTYPE_V4SF_INT_UQI:
9487 case V4SI_FTYPE_V4SI_V4SI_UHI:
9488 case V8SI_FTYPE_V8SI_V8SI_UHI:
9489 nargs = 3;
9490 mask_pos = 1;
9491 nargs_constant = 1;
9492 break;
9493 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9494 nargs = 5;
9495 rmode = V4DImode;
9496 mask_pos = 2;
9497 nargs_constant = 1;
9498 break;
9499 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9500 nargs = 5;
9501 rmode = V2DImode;
9502 mask_pos = 2;
9503 nargs_constant = 1;
9504 break;
9505 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9506 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9507 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9508 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9509 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9510 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9511 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9512 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9513 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9514 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9515 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9516 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9517 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9518 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9519 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9520 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9521 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9522 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9523 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9524 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9525 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9526 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9527 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9528 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9529 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9530 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9531 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9532 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9533 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9534 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9535 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9536 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9537 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9538 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9539 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9540 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9541 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9542 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9543 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9544 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9545 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9546 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9547 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9548 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9549 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9550 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9551 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9552 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9553 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9554 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9555 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9556 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9557 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9558 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9559 nargs = 4;
9560 break;
9561 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9562 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9563 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9564 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9565 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9566 nargs = 4;
9567 nargs_constant = 1;
9568 break;
9569 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9570 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9571 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9572 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9573 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9574 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9575 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9576 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9577 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9578 case USI_FTYPE_V32QI_V32QI_INT_USI:
9579 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9580 case USI_FTYPE_V32HI_V32HI_INT_USI:
9581 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9582 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9583 nargs = 4;
9584 mask_pos = 1;
9585 nargs_constant = 1;
9586 break;
9587 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9588 nargs = 4;
9589 nargs_constant = 2;
9590 break;
9591 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9592 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9593 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9594 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9595 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9596 nargs = 4;
9597 break;
9598 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9599 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9600 mask_pos = 1;
9601 nargs = 4;
9602 nargs_constant = 1;
9603 break;
9604 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9605 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9606 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9607 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9608 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9609 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9610 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9611 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9612 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9613 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9614 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9615 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9616 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9617 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9618 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9619 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9620 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9621 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9622 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9623 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9624 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9625 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9626 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9627 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9628 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9629 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9630 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9631 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9632 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9633 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9634 nargs = 4;
9635 mask_pos = 2;
9636 nargs_constant = 1;
9637 break;
9638 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9639 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9640 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9641 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9642 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9643 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9644 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9645 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9646 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9647 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9648 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9649 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9650 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9651 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9652 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9653 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9654 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9655 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9656 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9657 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9658 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9659 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9660 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9661 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9662 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9663 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9664 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9665 nargs = 5;
9666 mask_pos = 2;
9667 nargs_constant = 1;
9668 break;
9669 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9670 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9671 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9672 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9673 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9674 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9675 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9676 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9677 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9678 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9679 nargs = 5;
9680 mask_pos = 1;
9681 nargs_constant = 1;
9682 break;
9683 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9684 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9685 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9686 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9687 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9688 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9689 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9690 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9691 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9692 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9693 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9694 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9695 nargs = 5;
9696 mask_pos = 1;
9697 nargs_constant = 2;
9698 break;
9699
9700 default:
9701 gcc_unreachable ();
9702 }
9703
9704 gcc_assert (nargs <= ARRAY_SIZE (args));
9705
9706 if (comparison != UNKNOWN)
9707 {
9708 gcc_assert (nargs == 2);
9709 return ix86_expand_sse_compare (d, exp, target, swap);
9710 }
9711
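/* If the builtin's return mode RMODE differs from the insn's result
   mode TMODE, let the insn write a fresh TMODE pseudo (REAL_TARGET)
   and return a lowpart RMODE view of it as TARGET; otherwise the insn
   writes TARGET directly.  */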
9712 if (rmode == VOIDmode || rmode == tmode)
9713 {
9714 if (optimize
9715 || target == 0
9716 || GET_MODE (target) != tmode
9717 || !insn_p->operand[0].predicate (target, tmode))
9718 target = gen_reg_rtx (tmode);
9719 else if (memory_operand (target, tmode))
9720 num_memory++;
9721 real_target = target;
9722 }
9723 else
9724 {
9725 real_target = gen_reg_rtx (tmode);
9726 target = lowpart_subreg (rmode, real_target, tmode);
9727 }
9728
9729 for (i = 0; i < nargs; i++)
9730 {
9731 tree arg = CALL_EXPR_ARG (exp, i);
9732 rtx op = expand_normal (arg);
9733 machine_mode mode = insn_p->operand[i + 1].mode;
9734 bool match = insn_p->operand[i + 1].predicate (op, mode);
9735
9736 if (second_arg_count && i == 1)
9737 {
9738 /* SIMD shift insns take either an 8-bit immediate or a
9739 register as the count.  But the builtin functions take an
9740 int as the count.  If the count doesn't match, put it in a
9741 register.  The instructions use a 64-bit count; if op is
9742 only 32-bit, zero-extend it, since negative shift counts
9743 are undefined behavior and zero-extension is more
9744 efficient.  */
9745 if (!match)
9746 {
9747 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9748 op = convert_modes (mode, GET_MODE (op), op, 1);
9749 else
9750 op = lowpart_subreg (mode, op, GET_MODE (op));
9751 if (!insn_p->operand[i + 1].predicate (op, mode))
9752 op = copy_to_reg (op);
9753 }
9754 }
9755 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9756 || (!mask_pos && (nargs - i) <= nargs_constant))
9757 {
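/* This operand must be an immediate; if it is not, report the
   immediate width the specific instruction requires.  */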
9758 if (!match)
9759 switch (icode)
9760 {
9761 case CODE_FOR_avx_vinsertf128v4di:
9762 case CODE_FOR_avx_vextractf128v4di:
9763 error ("the last argument must be an 1-bit immediate");
9764 return const0_rtx;
9765
9766 case CODE_FOR_avx512f_cmpv8di3_mask:
9767 case CODE_FOR_avx512f_cmpv16si3_mask:
9768 case CODE_FOR_avx512f_ucmpv8di3_mask:
9769 case CODE_FOR_avx512f_ucmpv16si3_mask:
9770 case CODE_FOR_avx512vl_cmpv4di3_mask:
9771 case CODE_FOR_avx512vl_cmpv8si3_mask:
9772 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9773 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9774 case CODE_FOR_avx512vl_cmpv2di3_mask:
9775 case CODE_FOR_avx512vl_cmpv4si3_mask:
9776 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9777 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9778 error ("the last argument must be a 3-bit immediate");
9779 return const0_rtx;
9780
9781 case CODE_FOR_sse4_1_roundsd:
9782 case CODE_FOR_sse4_1_roundss:
9783
9784 case CODE_FOR_sse4_1_roundpd:
9785 case CODE_FOR_sse4_1_roundps:
9786 case CODE_FOR_avx_roundpd256:
9787 case CODE_FOR_avx_roundps256:
9788
9789 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9790 case CODE_FOR_sse4_1_roundps_sfix:
9791 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9792 case CODE_FOR_avx_roundps_sfix256:
9793
9794 case CODE_FOR_sse4_1_blendps:
9795 case CODE_FOR_avx_blendpd256:
9796 case CODE_FOR_avx_vpermilv4df:
9797 case CODE_FOR_avx_vpermilv4df_mask:
9798 case CODE_FOR_avx512f_getmantv8df_mask:
9799 case CODE_FOR_avx512f_getmantv16sf_mask:
9800 case CODE_FOR_avx512vl_getmantv8sf_mask:
9801 case CODE_FOR_avx512vl_getmantv4df_mask:
9802 case CODE_FOR_avx512vl_getmantv4sf_mask:
9803 case CODE_FOR_avx512vl_getmantv2df_mask:
9804 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9805 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9806 case CODE_FOR_avx512dq_rangepv4df_mask:
9807 case CODE_FOR_avx512dq_rangepv8sf_mask:
9808 case CODE_FOR_avx512dq_rangepv2df_mask:
9809 case CODE_FOR_avx512dq_rangepv4sf_mask:
9810 case CODE_FOR_avx_shufpd256_mask:
9811 error ("the last argument must be a 4-bit immediate");
9812 return const0_rtx;
9813
9814 case CODE_FOR_sha1rnds4:
9815 case CODE_FOR_sse4_1_blendpd:
9816 case CODE_FOR_avx_vpermilv2df:
9817 case CODE_FOR_avx_vpermilv2df_mask:
9818 case CODE_FOR_xop_vpermil2v2df3:
9819 case CODE_FOR_xop_vpermil2v4sf3:
9820 case CODE_FOR_xop_vpermil2v4df3:
9821 case CODE_FOR_xop_vpermil2v8sf3:
9822 case CODE_FOR_avx512f_vinsertf32x4_mask:
9823 case CODE_FOR_avx512f_vinserti32x4_mask:
9824 case CODE_FOR_avx512f_vextractf32x4_mask:
9825 case CODE_FOR_avx512f_vextracti32x4_mask:
9826 case CODE_FOR_sse2_shufpd:
9827 case CODE_FOR_sse2_shufpd_mask:
9828 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9829 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9830 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9831 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9832 error ("the last argument must be a 2-bit immediate");
9833 return const0_rtx;
9834
9835 case CODE_FOR_avx_vextractf128v4df:
9836 case CODE_FOR_avx_vextractf128v8sf:
9837 case CODE_FOR_avx_vextractf128v8si:
9838 case CODE_FOR_avx_vinsertf128v4df:
9839 case CODE_FOR_avx_vinsertf128v8sf:
9840 case CODE_FOR_avx_vinsertf128v8si:
9841 case CODE_FOR_avx512f_vinsertf64x4_mask:
9842 case CODE_FOR_avx512f_vinserti64x4_mask:
9843 case CODE_FOR_avx512f_vextractf64x4_mask:
9844 case CODE_FOR_avx512f_vextracti64x4_mask:
9845 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9846 case CODE_FOR_avx512dq_vinserti32x8_mask:
9847 case CODE_FOR_avx512vl_vinsertv4df:
9848 case CODE_FOR_avx512vl_vinsertv4di:
9849 case CODE_FOR_avx512vl_vinsertv8sf:
9850 case CODE_FOR_avx512vl_vinsertv8si:
9851 error ("the last argument must be a 1-bit immediate");
9852 return const0_rtx;
9853
9854 case CODE_FOR_avx_vmcmpv2df3:
9855 case CODE_FOR_avx_vmcmpv4sf3:
9856 case CODE_FOR_avx_cmpv2df3:
9857 case CODE_FOR_avx_cmpv4sf3:
9858 case CODE_FOR_avx_cmpv4df3:
9859 case CODE_FOR_avx_cmpv8sf3:
9860 case CODE_FOR_avx512f_cmpv8df3_mask:
9861 case CODE_FOR_avx512f_cmpv16sf3_mask:
9862 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9863 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9864 error ("the last argument must be a 5-bit immediate");
9865 return const0_rtx;
9866
9867 default:
9868 switch (nargs_constant)
9869 {
9870 case 2:
9871 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9872 || (!mask_pos && (nargs - i) == nargs_constant))
9873 {
9874 error ("the next to last argument must be an 8-bit immediate");
9875 break;
9876 }
9877 /* FALLTHRU */
9878 case 1:
9879 error ("the last argument must be an 8-bit immediate");
9880 break;
9881 default:
9882 gcc_unreachable ();
9883 }
9884 return const0_rtx;
9885 }
9886 }
9887 else
9888 {
9889 if (VECTOR_MODE_P (mode))
9890 op = safe_vector_operand (op, mode);
9891
9892 /* If we aren't optimizing, only allow one memory operand to
9893 be generated. */
9894 if (memory_operand (op, mode))
9895 num_memory++;
9896
9897 op = fixup_modeless_constant (op, mode);
9898
9899 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9900 {
9901 if (optimize || !match || num_memory > 1)
9902 op = copy_to_mode_reg (mode, op);
9903 }
9904 else
9905 {
9906 op = copy_to_reg (op);
9907 op = lowpart_subreg (mode, op, GET_MODE (op));
9908 }
9909 }
9910
9911 args[i].op = op;
9912 args[i].mode = mode;
9913 }
9914
9915 switch (nargs)
9916 {
9917 case 1:
9918 pat = GEN_FCN (icode) (real_target, args[0].op);
9919 break;
9920 case 2:
9921 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9922 break;
9923 case 3:
9924 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9925 args[2].op);
9926 break;
9927 case 4:
9928 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9929 args[2].op, args[3].op);
9930 break;
9931 case 5:
9932 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9933 args[2].op, args[3].op, args[4].op);
9934 break;
9935 case 6:
9936 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9937 args[2].op, args[3].op, args[4].op,
9938 args[5].op);
9939 break;
9940 default:
9941 gcc_unreachable ();
9942 }
9943
9944 if (! pat)
9945 return 0;
9946
9947 emit_insn (pat);
9948 return target;
9949 }
9950
9951 /* Transform a pattern of the following layout:
9952 (set A
9953 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9954 into:
9955 (set A B)
9956 i.e. strip the embedded rounding unspec.  */
9957
9958 static rtx
9959 ix86_erase_embedded_rounding (rtx pat)
9960 {
9961 if (GET_CODE (pat) == INSN)
9962 pat = PATTERN (pat);
9963
9964 gcc_assert (GET_CODE (pat) == SET);
9965 rtx src = SET_SRC (pat);
9966 gcc_assert (XVECLEN (src, 0) == 2);
9967 rtx p0 = XVECEXP (src, 0, 0);
9968 gcc_assert (GET_CODE (src) == UNSPEC
9969 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9970 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9971 return res;
9972 }
9973
9974 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9975 with rounding. */
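/* Illustrative call (a sketch; see avx512fintrin.h for the real
   wrappers) that is routed through this function:

     int r = _mm_comi_round_ss (a, b, _CMP_GT_OS, _MM_FROUND_NO_EXC);

   where the comparison predicate is the third argument and the
   rounding/SAE control is the fourth.  */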
9976 static rtx
9977 ix86_expand_sse_comi_round (const struct builtin_description *d,
9978 tree exp, rtx target)
9979 {
9980 rtx pat, set_dst;
9981 tree arg0 = CALL_EXPR_ARG (exp, 0);
9982 tree arg1 = CALL_EXPR_ARG (exp, 1);
9983 tree arg2 = CALL_EXPR_ARG (exp, 2);
9984 tree arg3 = CALL_EXPR_ARG (exp, 3);
9985 rtx op0 = expand_normal (arg0);
9986 rtx op1 = expand_normal (arg1);
9987 rtx op2 = expand_normal (arg2);
9988 rtx op3 = expand_normal (arg3);
9989 enum insn_code icode = d->icode;
9990 const struct insn_data_d *insn_p = &insn_data[icode];
9991 machine_mode mode0 = insn_p->operand[0].mode;
9992 machine_mode mode1 = insn_p->operand[1].mode;
9993
9994 /* See avxintrin.h for values. */
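/* For reference, the first eight indices correspond to _CMP_EQ_OQ,
   _CMP_LT_OS, _CMP_LE_OS, _CMP_UNORD_Q, _CMP_NEQ_UQ, _CMP_NLT_US,
   _CMP_NLE_US and _CMP_ORD_Q; the remaining entries follow the same
   VCMP predicate encoding.  */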
9995 static const enum rtx_code comparisons[32] =
9996 {
9997 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9998 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9999 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10000 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
10001 };
10002 static const bool ordereds[32] =
10003 {
10004 true, true, true, false, false, false, false, true,
10005 false, false, false, true, true, true, true, false,
10006 true, true, true, false, false, false, false, true,
10007 false, false, false, true, true, true, true, false
10008 };
10009 static const bool non_signalings[32] =
10010 {
10011 true, false, false, true, true, false, false, true,
10012 true, false, false, true, true, false, false, true,
10013 false, true, true, false, false, true, true, false,
10014 false, true, true, false, false, true, true, false
10015 };
10016
10017 if (!CONST_INT_P (op2))
10018 {
10019 error ("the third argument must be comparison constant");
10020 return const0_rtx;
10021 }
10022 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10023 {
10024 error ("incorrect comparison mode");
10025 return const0_rtx;
10026 }
10027
10028 if (!insn_p->operand[2].predicate (op3, SImode))
10029 {
10030 error ("incorrect rounding operand");
10031 return const0_rtx;
10032 }
10033
10034 if (VECTOR_MODE_P (mode0))
10035 op0 = safe_vector_operand (op0, mode0);
10036 if (VECTOR_MODE_P (mode1))
10037 op1 = safe_vector_operand (op1, mode1);
10038
10039 enum rtx_code comparison = comparisons[INTVAL (op2)];
10040 bool ordered = ordereds[INTVAL (op2)];
10041 bool non_signaling = non_signalings[INTVAL (op2)];
10042 rtx const_val = const0_rtx;
10043
10044 bool check_unordered = false;
10045 machine_mode mode = CCFPmode;
10046 switch (comparison)
10047 {
10048 case ORDERED:
10049 if (!ordered)
10050 {
10051 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10052 if (!non_signaling)
10053 ordered = true;
10054 mode = CCSmode;
10055 }
10056 else
10057 {
10058 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10059 if (non_signaling)
10060 ordered = false;
10061 mode = CCPmode;
10062 }
10063 comparison = NE;
10064 break;
10065 case UNORDERED:
10066 if (ordered)
10067 {
10068 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10069 if (non_signaling)
10070 ordered = false;
10071 mode = CCSmode;
10072 }
10073 else
10074 {
10075 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10076 if (!non_signaling)
10077 ordered = true;
10078 mode = CCPmode;
10079 }
10080 comparison = EQ;
10081 break;
10082
10083 case LE: /* -> GE */
10084 case LT: /* -> GT */
10085 case UNGE: /* -> UNLE */
10086 case UNGT: /* -> UNLT */
10087 std::swap (op0, op1);
10088 comparison = swap_condition (comparison);
10089 /* FALLTHRU */
10090 case GT:
10091 case GE:
10092 case UNEQ:
10093 case UNLT:
10094 case UNLE:
10095 case LTGT:
10096 /* These are supported by CCFPmode. NB: Use ordered/signaling
10097 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10098 with NAN operands. */
10099 if (ordered == non_signaling)
10100 ordered = !ordered;
10101 break;
10102 case EQ:
10103 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10104 _CMP_EQ_OQ/_CMP_EQ_OS. */
10105 check_unordered = true;
10106 mode = CCZmode;
10107 break;
10108 case NE:
10109 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10110 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10111 gcc_assert (!ordered);
10112 check_unordered = true;
10113 mode = CCZmode;
10114 const_val = const1_rtx;
10115 break;
10116 default:
10117 gcc_unreachable ();
10118 }
10119
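/* Build the result in the low QImode part of an SImode pseudo that is
   pre-loaded with CONST_VAL, so that the unordered early-exit path
   below leaves that default value in place.  */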
10120 target = gen_reg_rtx (SImode);
10121 emit_move_insn (target, const_val);
10122 target = gen_rtx_SUBREG (QImode, target, 0);
10123
10124 if ((optimize && !register_operand (op0, mode0))
10125 || !insn_p->operand[0].predicate (op0, mode0))
10126 op0 = copy_to_mode_reg (mode0, op0);
10127 if ((optimize && !register_operand (op1, mode1))
10128 || !insn_p->operand[1].predicate (op1, mode1))
10129 op1 = copy_to_mode_reg (mode1, op1);
10130
10131 /*
10132 1. COMI: ordered and signaling.
10133 2. UCOMI: unordered and non-signaling.
10134 */
10135 if (non_signaling)
10136 icode = (icode == CODE_FOR_sse_comi_round
10137 ? CODE_FOR_sse_ucomi_round
10138 : CODE_FOR_sse2_ucomi_round);
10139
10140 pat = GEN_FCN (icode) (op0, op1, op3);
10141 if (! pat)
10142 return 0;
10143
10144 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10145 if (INTVAL (op3) == NO_ROUND)
10146 {
10147 pat = ix86_erase_embedded_rounding (pat);
10148 if (! pat)
10149 return 0;
10150
10151 set_dst = SET_DEST (pat);
10152 }
10153 else
10154 {
10155 gcc_assert (GET_CODE (pat) == SET);
10156 set_dst = SET_DEST (pat);
10157 }
10158
10159 emit_insn (pat);
10160
10161 rtx_code_label *label = NULL;
10162
10163 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10164 sufficient with NAN operands.  */
10165 if (check_unordered)
10166 {
10167 gcc_assert (comparison == EQ || comparison == NE);
10168
10169 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10170 label = gen_label_rtx ();
10171 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10172 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10173 gen_rtx_LABEL_REF (VOIDmode, label),
10174 pc_rtx);
10175 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10176 }
10177
10178 /* NB: Set CCFPmode and check a different CCmode which is a subset
10179 of CCFPmode.  */
10180 if (GET_MODE (set_dst) != mode)
10181 {
10182 gcc_assert (mode == CCAmode || mode == CCCmode
10183 || mode == CCOmode || mode == CCPmode
10184 || mode == CCSmode || mode == CCZmode);
10185 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10186 }
10187
10188 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10189 gen_rtx_fmt_ee (comparison, QImode,
10190 set_dst,
10191 const0_rtx)));
10192
10193 if (label)
10194 emit_label (label);
10195
10196 return SUBREG_REG (target);
10197 }
10198
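/* Subroutine of ix86_expand_builtin to take care of insns with
   embedded rounding.  */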
10199 static rtx
10200 ix86_expand_round_builtin (const struct builtin_description *d,
10201 tree exp, rtx target)
10202 {
10203 rtx pat;
10204 unsigned int i, nargs;
10205 struct
10206 {
10207 rtx op;
10208 machine_mode mode;
10209 } args[6];
10210 enum insn_code icode = d->icode;
10211 const struct insn_data_d *insn_p = &insn_data[icode];
10212 machine_mode tmode = insn_p->operand[0].mode;
10213 unsigned int nargs_constant = 0;
10214 unsigned int redundant_embed_rnd = 0;
10215
10216 switch ((enum ix86_builtin_func_type) d->flag)
10217 {
10218 case UINT64_FTYPE_V2DF_INT:
10219 case UINT64_FTYPE_V4SF_INT:
10220 case UINT_FTYPE_V2DF_INT:
10221 case UINT_FTYPE_V4SF_INT:
10222 case INT64_FTYPE_V2DF_INT:
10223 case INT64_FTYPE_V4SF_INT:
10224 case INT_FTYPE_V2DF_INT:
10225 case INT_FTYPE_V4SF_INT:
10226 nargs = 2;
10227 break;
10228 case V4SF_FTYPE_V4SF_UINT_INT:
10229 case V4SF_FTYPE_V4SF_UINT64_INT:
10230 case V2DF_FTYPE_V2DF_UINT64_INT:
10231 case V4SF_FTYPE_V4SF_INT_INT:
10232 case V4SF_FTYPE_V4SF_INT64_INT:
10233 case V2DF_FTYPE_V2DF_INT64_INT:
10234 case V4SF_FTYPE_V4SF_V4SF_INT:
10235 case V2DF_FTYPE_V2DF_V2DF_INT:
10236 case V4SF_FTYPE_V4SF_V2DF_INT:
10237 case V2DF_FTYPE_V2DF_V4SF_INT:
10238 nargs = 3;
10239 break;
10240 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10241 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10242 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10243 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10244 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10245 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10246 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10247 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10248 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10249 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10250 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10251 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10252 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10253 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10254 nargs = 4;
10255 break;
10256 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10257 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10258 nargs_constant = 2;
10259 nargs = 4;
10260 break;
10261 case INT_FTYPE_V4SF_V4SF_INT_INT:
10262 case INT_FTYPE_V2DF_V2DF_INT_INT:
10263 return ix86_expand_sse_comi_round (d, exp, target);
10264 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10265 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10266 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10267 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10268 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10269 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10270 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10271 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10272 nargs = 5;
10273 break;
10274 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10275 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10276 nargs_constant = 4;
10277 nargs = 5;
10278 break;
10279 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10280 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10281 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10282 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10283 nargs_constant = 3;
10284 nargs = 5;
10285 break;
10286 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10287 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10288 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10289 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10290 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10291 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10292 nargs = 6;
10293 nargs_constant = 4;
10294 break;
10295 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10296 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10297 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10298 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10299 nargs = 6;
10300 nargs_constant = 3;
10301 break;
10302 default:
10303 gcc_unreachable ();
10304 }
10305 gcc_assert (nargs <= ARRAY_SIZE (args));
10306
10307 if (optimize
10308 || target == 0
10309 || GET_MODE (target) != tmode
10310 || !insn_p->operand[0].predicate (target, tmode))
10311 target = gen_reg_rtx (tmode);
10312
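/* The last argument is always the rounding/SAE control; when
   NARGS_CONSTANT is nonzero, the argument NARGS_CONSTANT positions
   from the end must be an immediate of the width the instruction
   requires.  */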
10313 for (i = 0; i < nargs; i++)
10314 {
10315 tree arg = CALL_EXPR_ARG (exp, i);
10316 rtx op = expand_normal (arg);
10317 machine_mode mode = insn_p->operand[i + 1].mode;
10318 bool match = insn_p->operand[i + 1].predicate (op, mode);
10319
10320 if (i == nargs - nargs_constant)
10321 {
10322 if (!match)
10323 {
10324 switch (icode)
10325 {
10326 case CODE_FOR_avx512f_getmantv8df_mask_round:
10327 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10328 case CODE_FOR_avx512f_vgetmantv2df_round:
10329 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10330 case CODE_FOR_avx512f_vgetmantv4sf_round:
10331 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10332 error ("the immediate argument must be a 4-bit immediate");
10333 return const0_rtx;
10334 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10335 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10336 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10337 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10338 error ("the immediate argument must be a 5-bit immediate");
10339 return const0_rtx;
10340 default:
10341 error ("the immediate argument must be an 8-bit immediate");
10342 return const0_rtx;
10343 }
10344 }
10345 }
10346 else if (i == nargs - 1)
10347 {
10348 if (!insn_p->operand[nargs].predicate (op, SImode))
10349 {
10350 error ("incorrect rounding operand");
10351 return const0_rtx;
10352 }
10353
10354 /* If there is no rounding, use the normal version of the pattern.  */
10355 if (INTVAL (op) == NO_ROUND)
10356 redundant_embed_rnd = 1;
10357 }
10358 else
10359 {
10360 if (VECTOR_MODE_P (mode))
10361 op = safe_vector_operand (op, mode);
10362
10363 op = fixup_modeless_constant (op, mode);
10364
10365 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10366 {
10367 if (optimize || !match)
10368 op = copy_to_mode_reg (mode, op);
10369 }
10370 else
10371 {
10372 op = copy_to_reg (op);
10373 op = lowpart_subreg (mode, op, GET_MODE (op));
10374 }
10375 }
10376
10377 args[i].op = op;
10378 args[i].mode = mode;
10379 }
10380
10381 switch (nargs)
10382 {
10383 case 1:
10384 pat = GEN_FCN (icode) (target, args[0].op);
10385 break;
10386 case 2:
10387 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10388 break;
10389 case 3:
10390 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10391 args[2].op);
10392 break;
10393 case 4:
10394 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10395 args[2].op, args[3].op);
10396 break;
10397 case 5:
10398 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10399 args[2].op, args[3].op, args[4].op);
10400 break;
10401 case 6:
10402 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10403 args[2].op, args[3].op, args[4].op,
10404 args[5].op);
10405 break;
10406 default:
10407 gcc_unreachable ();
10408 }
10409
10410 if (!pat)
10411 return 0;
10412
10413 if (redundant_embed_rnd)
10414 pat = ix86_erase_embedded_rounding (pat);
10415
10416 emit_insn (pat);
10417 return target;
10418 }
10419
10420 /* Subroutine of ix86_expand_builtin to take care of special insns
10421 with variable number of operands. */
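/* These are mostly the load/store-style builtins (e.g. the
   non-temporal moves and the masked loads/stores handled below),
   where one operand is a memory reference rather than a register.  */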
10422
10423 static rtx
10424 ix86_expand_special_args_builtin (const struct builtin_description *d,
10425 tree exp, rtx target)
10426 {
10427 tree arg;
10428 rtx pat, op;
10429 unsigned int i, nargs, arg_adjust, memory;
10430 bool aligned_mem = false;
10431 struct
10432 {
10433 rtx op;
10434 machine_mode mode;
10435 } args[3];
10436 enum insn_code icode = d->icode;
10437 bool last_arg_constant = false;
10438 const struct insn_data_d *insn_p = &insn_data[icode];
10439 machine_mode tmode = insn_p->operand[0].mode;
10440 enum { load, store } klass;
10441
10442 switch ((enum ix86_builtin_func_type) d->flag)
10443 {
10444 case VOID_FTYPE_VOID:
10445 emit_insn (GEN_FCN (icode) (target));
10446 return 0;
10447 case VOID_FTYPE_UINT64:
10448 case VOID_FTYPE_UNSIGNED:
10449 nargs = 0;
10450 klass = store;
10451 memory = 0;
10452 break;
10453
10454 case INT_FTYPE_VOID:
10455 case USHORT_FTYPE_VOID:
10456 case UINT64_FTYPE_VOID:
10457 case UINT_FTYPE_VOID:
10458 case UNSIGNED_FTYPE_VOID:
10459 nargs = 0;
10460 klass = load;
10461 memory = 0;
10462 break;
10463 case UINT64_FTYPE_PUNSIGNED:
10464 case V2DI_FTYPE_PV2DI:
10465 case V4DI_FTYPE_PV4DI:
10466 case V32QI_FTYPE_PCCHAR:
10467 case V16QI_FTYPE_PCCHAR:
10468 case V8SF_FTYPE_PCV4SF:
10469 case V8SF_FTYPE_PCFLOAT:
10470 case V4SF_FTYPE_PCFLOAT:
10471 case V4DF_FTYPE_PCV2DF:
10472 case V4DF_FTYPE_PCDOUBLE:
10473 case V2DF_FTYPE_PCDOUBLE:
10474 case VOID_FTYPE_PVOID:
10475 case V8DI_FTYPE_PV8DI:
10476 nargs = 1;
10477 klass = load;
10478 memory = 0;
10479 switch (icode)
10480 {
10481 case CODE_FOR_sse4_1_movntdqa:
10482 case CODE_FOR_avx2_movntdqa:
10483 case CODE_FOR_avx512f_movntdqa:
10484 aligned_mem = true;
10485 break;
10486 default:
10487 break;
10488 }
10489 break;
10490 case VOID_FTYPE_PV2SF_V4SF:
10491 case VOID_FTYPE_PV8DI_V8DI:
10492 case VOID_FTYPE_PV4DI_V4DI:
10493 case VOID_FTYPE_PV2DI_V2DI:
10494 case VOID_FTYPE_PCHAR_V32QI:
10495 case VOID_FTYPE_PCHAR_V16QI:
10496 case VOID_FTYPE_PFLOAT_V16SF:
10497 case VOID_FTYPE_PFLOAT_V8SF:
10498 case VOID_FTYPE_PFLOAT_V4SF:
10499 case VOID_FTYPE_PDOUBLE_V8DF:
10500 case VOID_FTYPE_PDOUBLE_V4DF:
10501 case VOID_FTYPE_PDOUBLE_V2DF:
10502 case VOID_FTYPE_PLONGLONG_LONGLONG:
10503 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10504 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10505 case VOID_FTYPE_PINT_INT:
10506 nargs = 1;
10507 klass = store;
10508 /* Reserve memory operand for target. */
10509 memory = ARRAY_SIZE (args);
10510 switch (icode)
10511 {
10512 /* These builtins and instructions require the memory
10513 to be properly aligned. */
10514 case CODE_FOR_avx_movntv4di:
10515 case CODE_FOR_sse2_movntv2di:
10516 case CODE_FOR_avx_movntv8sf:
10517 case CODE_FOR_sse_movntv4sf:
10518 case CODE_FOR_sse4a_vmmovntv4sf:
10519 case CODE_FOR_avx_movntv4df:
10520 case CODE_FOR_sse2_movntv2df:
10521 case CODE_FOR_sse4a_vmmovntv2df:
10522 case CODE_FOR_sse2_movntidi:
10523 case CODE_FOR_sse_movntq:
10524 case CODE_FOR_sse2_movntisi:
10525 case CODE_FOR_avx512f_movntv16sf:
10526 case CODE_FOR_avx512f_movntv8df:
10527 case CODE_FOR_avx512f_movntv8di:
10528 aligned_mem = true;
10529 break;
10530 default:
10531 break;
10532 }
10533 break;
10534 case VOID_FTYPE_PVOID_PCVOID:
10535 nargs = 1;
10536 klass = store;
10537 memory = 0;
10538
10539 break;
10540 case V4SF_FTYPE_V4SF_PCV2SF:
10541 case V2DF_FTYPE_V2DF_PCDOUBLE:
10542 nargs = 2;
10543 klass = load;
10544 memory = 1;
10545 break;
10546 case V8SF_FTYPE_PCV8SF_V8SI:
10547 case V4DF_FTYPE_PCV4DF_V4DI:
10548 case V4SF_FTYPE_PCV4SF_V4SI:
10549 case V2DF_FTYPE_PCV2DF_V2DI:
10550 case V8SI_FTYPE_PCV8SI_V8SI:
10551 case V4DI_FTYPE_PCV4DI_V4DI:
10552 case V4SI_FTYPE_PCV4SI_V4SI:
10553 case V2DI_FTYPE_PCV2DI_V2DI:
10554 case VOID_FTYPE_INT_INT64:
10555 nargs = 2;
10556 klass = load;
10557 memory = 0;
10558 break;
10559 case VOID_FTYPE_PV8DF_V8DF_UQI:
10560 case VOID_FTYPE_PV4DF_V4DF_UQI:
10561 case VOID_FTYPE_PV2DF_V2DF_UQI:
10562 case VOID_FTYPE_PV16SF_V16SF_UHI:
10563 case VOID_FTYPE_PV8SF_V8SF_UQI:
10564 case VOID_FTYPE_PV4SF_V4SF_UQI:
10565 case VOID_FTYPE_PV8DI_V8DI_UQI:
10566 case VOID_FTYPE_PV4DI_V4DI_UQI:
10567 case VOID_FTYPE_PV2DI_V2DI_UQI:
10568 case VOID_FTYPE_PV16SI_V16SI_UHI:
10569 case VOID_FTYPE_PV8SI_V8SI_UQI:
10570 case VOID_FTYPE_PV4SI_V4SI_UQI:
10571 case VOID_FTYPE_PV64QI_V64QI_UDI:
10572 case VOID_FTYPE_PV32HI_V32HI_USI:
10573 case VOID_FTYPE_PV32QI_V32QI_USI:
10574 case VOID_FTYPE_PV16QI_V16QI_UHI:
10575 case VOID_FTYPE_PV16HI_V16HI_UHI:
10576 case VOID_FTYPE_PV8HI_V8HI_UQI:
10577 switch (icode)
10578 {
10579 /* These builtins and instructions require the memory
10580 to be properly aligned. */
10581 case CODE_FOR_avx512f_storev16sf_mask:
10582 case CODE_FOR_avx512f_storev16si_mask:
10583 case CODE_FOR_avx512f_storev8df_mask:
10584 case CODE_FOR_avx512f_storev8di_mask:
10585 case CODE_FOR_avx512vl_storev8sf_mask:
10586 case CODE_FOR_avx512vl_storev8si_mask:
10587 case CODE_FOR_avx512vl_storev4df_mask:
10588 case CODE_FOR_avx512vl_storev4di_mask:
10589 case CODE_FOR_avx512vl_storev4sf_mask:
10590 case CODE_FOR_avx512vl_storev4si_mask:
10591 case CODE_FOR_avx512vl_storev2df_mask:
10592 case CODE_FOR_avx512vl_storev2di_mask:
10593 aligned_mem = true;
10594 break;
10595 default:
10596 break;
10597 }
10598 /* FALLTHRU */
10599 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10600 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10601 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10602 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10603 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10604 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10605 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10606 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10607 case VOID_FTYPE_PV8SI_V8DI_UQI:
10608 case VOID_FTYPE_PV8HI_V8DI_UQI:
10609 case VOID_FTYPE_PV16HI_V16SI_UHI:
10610 case VOID_FTYPE_PV16QI_V8DI_UQI:
10611 case VOID_FTYPE_PV16QI_V16SI_UHI:
10612 case VOID_FTYPE_PV4SI_V4DI_UQI:
10613 case VOID_FTYPE_PV4SI_V2DI_UQI:
10614 case VOID_FTYPE_PV8HI_V4DI_UQI:
10615 case VOID_FTYPE_PV8HI_V2DI_UQI:
10616 case VOID_FTYPE_PV8HI_V8SI_UQI:
10617 case VOID_FTYPE_PV8HI_V4SI_UQI:
10618 case VOID_FTYPE_PV16QI_V4DI_UQI:
10619 case VOID_FTYPE_PV16QI_V2DI_UQI:
10620 case VOID_FTYPE_PV16QI_V8SI_UQI:
10621 case VOID_FTYPE_PV16QI_V4SI_UQI:
10622 case VOID_FTYPE_PCHAR_V64QI_UDI:
10623 case VOID_FTYPE_PCHAR_V32QI_USI:
10624 case VOID_FTYPE_PCHAR_V16QI_UHI:
10625 case VOID_FTYPE_PSHORT_V32HI_USI:
10626 case VOID_FTYPE_PSHORT_V16HI_UHI:
10627 case VOID_FTYPE_PSHORT_V8HI_UQI:
10628 case VOID_FTYPE_PINT_V16SI_UHI:
10629 case VOID_FTYPE_PINT_V8SI_UQI:
10630 case VOID_FTYPE_PINT_V4SI_UQI:
10631 case VOID_FTYPE_PINT64_V8DI_UQI:
10632 case VOID_FTYPE_PINT64_V4DI_UQI:
10633 case VOID_FTYPE_PINT64_V2DI_UQI:
10634 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10635 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10636 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10637 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10638 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10639 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10640 case VOID_FTYPE_PV32QI_V32HI_USI:
10641 case VOID_FTYPE_PV16QI_V16HI_UHI:
10642 case VOID_FTYPE_PV8QI_V8HI_UQI:
10643 nargs = 2;
10644 klass = store;
10645 /* Reserve memory operand for target. */
10646 memory = ARRAY_SIZE (args);
10647 break;
10648 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10649 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10650 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10651 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10652 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10653 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10654 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10655 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10656 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10657 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10658 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10659 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10660 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10661 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10662 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10663 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10664 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10665 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10666 switch (icode)
10667 {
10668 /* These builtins and instructions require the memory
10669 to be properly aligned. */
10670 case CODE_FOR_avx512f_loadv16sf_mask:
10671 case CODE_FOR_avx512f_loadv16si_mask:
10672 case CODE_FOR_avx512f_loadv8df_mask:
10673 case CODE_FOR_avx512f_loadv8di_mask:
10674 case CODE_FOR_avx512vl_loadv8sf_mask:
10675 case CODE_FOR_avx512vl_loadv8si_mask:
10676 case CODE_FOR_avx512vl_loadv4df_mask:
10677 case CODE_FOR_avx512vl_loadv4di_mask:
10678 case CODE_FOR_avx512vl_loadv4sf_mask:
10679 case CODE_FOR_avx512vl_loadv4si_mask:
10680 case CODE_FOR_avx512vl_loadv2df_mask:
10681 case CODE_FOR_avx512vl_loadv2di_mask:
10682 case CODE_FOR_avx512bw_loadv64qi_mask:
10683 case CODE_FOR_avx512vl_loadv32qi_mask:
10684 case CODE_FOR_avx512vl_loadv16qi_mask:
10685 case CODE_FOR_avx512bw_loadv32hi_mask:
10686 case CODE_FOR_avx512vl_loadv16hi_mask:
10687 case CODE_FOR_avx512vl_loadv8hi_mask:
10688 aligned_mem = true;
10689 break;
10690 default:
10691 break;
10692 }
10693 /* FALLTHRU */
10694 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10695 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10696 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10697 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10698 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10699 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10700 case V16SI_FTYPE_PCINT_V16SI_UHI:
10701 case V8SI_FTYPE_PCINT_V8SI_UQI:
10702 case V4SI_FTYPE_PCINT_V4SI_UQI:
10703 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10704 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10705 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10706 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10707 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10708 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10709 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10710 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10711 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10712 nargs = 3;
10713 klass = load;
10714 memory = 0;
10715 break;
10716 case VOID_FTYPE_UINT_UINT_UINT:
10717 case VOID_FTYPE_UINT64_UINT_UINT:
10718 case UCHAR_FTYPE_UINT_UINT_UINT:
10719 case UCHAR_FTYPE_UINT64_UINT_UINT:
10720 nargs = 3;
10721 klass = load;
10722 memory = ARRAY_SIZE (args);
10723 last_arg_constant = true;
10724 break;
10725 default:
10726 gcc_unreachable ();
10727 }
10728
10729 gcc_assert (nargs <= ARRAY_SIZE (args));
10730
10731 if (klass == store)
10732 {
10733 arg = CALL_EXPR_ARG (exp, 0);
10734 op = expand_normal (arg);
10735 gcc_assert (target == 0);
10736 if (memory)
10737 {
10738 op = ix86_zero_extend_to_Pmode (op);
10739 target = gen_rtx_MEM (tmode, op);
10740 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10741 on it. Try to improve it using get_pointer_alignment,
10742 and if the special builtin is one that requires strict
10743 mode alignment, also from its GET_MODE_ALIGNMENT.
10744 Failure to do so could lead to ix86_legitimate_combined_insn
10745 rejecting all changes to such insns. */
10746 unsigned int align = get_pointer_alignment (arg);
10747 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10748 align = GET_MODE_ALIGNMENT (tmode);
10749 if (MEM_ALIGN (target) < align)
10750 set_mem_align (target, align);
10751 }
10752 else
10753 target = force_reg (tmode, op);
10754 arg_adjust = 1;
10755 }
10756 else
10757 {
10758 arg_adjust = 0;
10759 if (optimize
10760 || target == 0
10761 || !register_operand (target, tmode)
10762 || GET_MODE (target) != tmode)
10763 target = gen_reg_rtx (tmode);
10764 }
10765
10766 for (i = 0; i < nargs; i++)
10767 {
10768 machine_mode mode = insn_p->operand[i + 1].mode;
10769 bool match;
10770
10771 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10772 op = expand_normal (arg);
10773 match = insn_p->operand[i + 1].predicate (op, mode);
10774
10775 if (last_arg_constant && (i + 1) == nargs)
10776 {
10777 if (!match)
10778 {
10779 if (icode == CODE_FOR_lwp_lwpvalsi3
10780 || icode == CODE_FOR_lwp_lwpinssi3
10781 || icode == CODE_FOR_lwp_lwpvaldi3
10782 || icode == CODE_FOR_lwp_lwpinsdi3)
10783 error ("the last argument must be a 32-bit immediate");
10784 else
10785 error ("the last argument must be an 8-bit immediate");
10786 return const0_rtx;
10787 }
10788 }
10789 else
10790 {
10791 if (i == memory)
10792 {
10793 /* This must be the memory operand. */
10794 op = ix86_zero_extend_to_Pmode (op);
10795 op = gen_rtx_MEM (mode, op);
10796 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10797 on it. Try to improve it using get_pointer_alignment,
10798 and if the special builtin is one that requires strict
10799 mode alignment, also from its GET_MODE_ALIGNMENT.
10800 Failure to do so could lead to ix86_legitimate_combined_insn
10801 rejecting all changes to such insns. */
10802 unsigned int align = get_pointer_alignment (arg);
10803 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10804 align = GET_MODE_ALIGNMENT (mode);
10805 if (MEM_ALIGN (op) < align)
10806 set_mem_align (op, align);
10807 }
10808 else
10809 {
10810 /* This must be register. */
10811 if (VECTOR_MODE_P (mode))
10812 op = safe_vector_operand (op, mode);
10813
10814 op = fixup_modeless_constant (op, mode);
10815
10816 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10817 op = copy_to_mode_reg (mode, op);
10818 else
10819 {
10820 op = copy_to_reg (op);
10821 op = lowpart_subreg (mode, op, GET_MODE (op));
10822 }
10823 }
10824 }
10825
10826 args[i].op = op;
10827 args[i].mode = mode;
10828 }
10829
10830 switch (nargs)
10831 {
10832 case 0:
10833 pat = GEN_FCN (icode) (target);
10834 break;
10835 case 1:
10836 pat = GEN_FCN (icode) (target, args[0].op);
10837 break;
10838 case 2:
10839 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10840 break;
10841 case 3:
10842 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10843 break;
10844 default:
10845 gcc_unreachable ();
10846 }
10847
10848 if (! pat)
10849 return 0;
10850 emit_insn (pat);
10851 return klass == store ? 0 : target;
10852 }
10853
10854 /* Return the integer constant in ARG. Constrain it to be in the range
10855 of the subparts of VEC_TYPE; issue an error if not. */
10856
10857 static int
10858 get_element_number (tree vec_type, tree arg)
10859 {
10860 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10861
10862 if (!tree_fits_uhwi_p (arg)
10863 || (elt = tree_to_uhwi (arg), elt > max))
10864 {
10865 error ("selector must be an integer constant in the range "
10866 "[0, %wi]", max);
10867 return 0;
10868 }
10869
10870 return elt;
10871 }
10872
10873 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10874 ix86_expand_vector_init. We DO have language-level syntax for this, in
10875 the form of (type){ init-list }. Except that since we can't place emms
10876 instructions from inside the compiler, we can't allow the use of MMX
10877 registers unless the user explicitly asks for it. So we do *not* define
10878 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10879 we have builtins invoked by mmintrin.h that give us license to emit
10880 these sorts of instructions. */
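
/* A sketch of how mmintrin.h is expected to reach the vec_init
   builtins (illustrative only, not the exact header text):

     extern __inline __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   which is expanded by ix86_expand_vec_init_builtin below.  */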
10881
10882 static rtx
10883 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10884 {
10885 machine_mode tmode = TYPE_MODE (type);
10886 machine_mode inner_mode = GET_MODE_INNER (tmode);
10887 int i, n_elt = GET_MODE_NUNITS (tmode);
10888 rtvec v = rtvec_alloc (n_elt);
10889
10890 gcc_assert (VECTOR_MODE_P (tmode));
10891 gcc_assert (call_expr_nargs (exp) == n_elt);
10892
10893 for (i = 0; i < n_elt; ++i)
10894 {
10895 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10896 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10897 }
10898
10899 if (!target || !register_operand (target, tmode))
10900 target = gen_reg_rtx (tmode);
10901
10902 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10903 return target;
10904 }
10905
10906 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10907 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10908 had a language-level syntax for referencing vector elements. */
10909
10910 static rtx
10911 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10912 {
10913 machine_mode tmode, mode0;
10914 tree arg0, arg1;
10915 int elt;
10916 rtx op0;
10917
10918 arg0 = CALL_EXPR_ARG (exp, 0);
10919 arg1 = CALL_EXPR_ARG (exp, 1);
10920
10921 op0 = expand_normal (arg0);
10922 elt = get_element_number (TREE_TYPE (arg0), arg1);
10923
10924 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10925 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10926 gcc_assert (VECTOR_MODE_P (mode0));
10927
10928 op0 = force_reg (mode0, op0);
10929
10930 if (optimize || !target || !register_operand (target, tmode))
10931 target = gen_reg_rtx (tmode);
10932
10933 ix86_expand_vector_extract (true, target, op0, elt);
10934
10935 return target;
10936 }
10937
10938 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10939 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10940 a language-level syntax for referencing vector elements. */
10941
10942 static rtx
10943 ix86_expand_vec_set_builtin (tree exp)
10944 {
10945 machine_mode tmode, mode1;
10946 tree arg0, arg1, arg2;
10947 int elt;
10948 rtx op0, op1, target;
10949
10950 arg0 = CALL_EXPR_ARG (exp, 0);
10951 arg1 = CALL_EXPR_ARG (exp, 1);
10952 arg2 = CALL_EXPR_ARG (exp, 2);
10953
10954 tmode = TYPE_MODE (TREE_TYPE (arg0));
10955 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10956 gcc_assert (VECTOR_MODE_P (tmode));
10957
10958 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10959 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10960 elt = get_element_number (TREE_TYPE (arg0), arg2);
10961
10962 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10963 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10964
10965 op0 = force_reg (tmode, op0);
10966 op1 = force_reg (mode1, op1);
10967
10968 /* OP0 is the source of these builtin functions and shouldn't be
10969 modified. Create a copy, use it and return it as target. */
10970 target = gen_reg_rtx (tmode);
10971 emit_move_insn (target, op0);
10972 ix86_expand_vector_set (true, target, op1, elt);
10973
10974 return target;
10975 }
10976
10977 /* Expand an expression EXP that calls a built-in function,
10978 with result going to TARGET if that's convenient
10979 (and in mode MODE if that's convenient).
10980 SUBTARGET may be used as the target for computing one of EXP's operands.
10981 IGNORE is nonzero if the value is to be ignored. */
10982
10983 rtx
10984 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10985 machine_mode mode, int ignore)
10986 {
10987 size_t i;
10988 enum insn_code icode, icode2;
10989 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10990 tree arg0, arg1, arg2, arg3, arg4;
10991 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10992 machine_mode mode0, mode1, mode2, mode3, mode4;
10993 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
10994
10995 /* For CPU builtins that can be folded, fold first and expand the fold. */
10996 switch (fcode)
10997 {
10998 case IX86_BUILTIN_CPU_INIT:
10999 {
11000 /* Make it call __cpu_indicator_init in libgcc. */
11001 tree call_expr, fndecl, type;
11002 type = build_function_type_list (integer_type_node, NULL_TREE);
11003 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11004 call_expr = build_call_expr (fndecl, 0);
11005 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11006 }
11007 case IX86_BUILTIN_CPU_IS:
11008 case IX86_BUILTIN_CPU_SUPPORTS:
11009 {
11010 tree arg0 = CALL_EXPR_ARG (exp, 0);
11011 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11012 gcc_assert (fold_expr != NULL_TREE);
11013 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11014 }
11015 }
11016
11017 HOST_WIDE_INT isa = ix86_isa_flags;
11018 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11019 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11020 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11021 /* The general case is we require all the ISAs specified in bisa{,2}
11022 to be enabled.
11023 The exceptions are:
11024 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11025 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11026 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11027 where for each such pair it is sufficient if either of the ISAs is
11028 enabled; and if the pair is ORed with other options, those others must be enabled too.  */
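/* Fold each special pair into ISA below so that the
   (bisa & isa) != bisa test treats the pair as satisfied when either
   member is enabled.  */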
11029 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11030 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11031 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11032 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11033 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11034 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11035 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11036 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11037 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11038 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11039 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11040 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11041 /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
11042 MMX is disabled. NB: Since MMX intrinsics are marked with
11043 SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
11044 enabled. */
11045 if (TARGET_MMX || TARGET_MMX_WITH_SSE)
11046 {
11047 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
11048 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
11049 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
11050 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
11051 if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
11052 == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
11053 && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
11054 isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
11055 if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
11056 == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
11057 && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
11058 isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
11059 }
11060 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11061 {
11062 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11063 if (TARGET_ABI_X32)
11064 bisa |= OPTION_MASK_ABI_X32;
11065 else
11066 bisa |= OPTION_MASK_ABI_64;
11067 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11068 (enum fpmath_unit) 0, false, add_abi_p);
11069 if (!opts)
11070 error ("%qE needs unknown isa option", fndecl);
11071 else
11072 {
11073 gcc_assert (opts != NULL);
11074 error ("%qE needs isa option %s", fndecl, opts);
11075 free (opts);
11076 }
11077 return expand_call (exp, target, ignore);
11078 }
11079
11080 switch (fcode)
11081 {
11082 case IX86_BUILTIN_MASKMOVQ:
11083 case IX86_BUILTIN_MASKMOVDQU:
11084 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11085 ? CODE_FOR_mmx_maskmovq
11086 : CODE_FOR_sse2_maskmovdqu);
11087 /* Note the arg order is different from the operand order. */
11088 arg1 = CALL_EXPR_ARG (exp, 0);
11089 arg2 = CALL_EXPR_ARG (exp, 1);
11090 arg0 = CALL_EXPR_ARG (exp, 2);
11091 op0 = expand_normal (arg0);
11092 op1 = expand_normal (arg1);
11093 op2 = expand_normal (arg2);
11094 mode0 = insn_data[icode].operand[0].mode;
11095 mode1 = insn_data[icode].operand[1].mode;
11096 mode2 = insn_data[icode].operand[2].mode;
11097
11098 op0 = ix86_zero_extend_to_Pmode (op0);
11099 op0 = gen_rtx_MEM (mode1, op0);
11100
11101 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11102 op0 = copy_to_mode_reg (mode0, op0);
11103 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11104 op1 = copy_to_mode_reg (mode1, op1);
11105 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11106 op2 = copy_to_mode_reg (mode2, op2);
11107 pat = GEN_FCN (icode) (op0, op1, op2);
11108 if (! pat)
11109 return 0;
11110 emit_insn (pat);
11111 return 0;
11112
11113 case IX86_BUILTIN_LDMXCSR:
11114 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11115 target = assign_386_stack_local (SImode, SLOT_TEMP);
11116 emit_move_insn (target, op0);
11117 emit_insn (gen_sse_ldmxcsr (target));
11118 return 0;
11119
11120 case IX86_BUILTIN_STMXCSR:
11121 target = assign_386_stack_local (SImode, SLOT_TEMP);
11122 emit_insn (gen_sse_stmxcsr (target));
11123 return copy_to_mode_reg (SImode, target);
11124
11125 case IX86_BUILTIN_CLFLUSH:
11126 arg0 = CALL_EXPR_ARG (exp, 0);
11127 op0 = expand_normal (arg0);
11128 icode = CODE_FOR_sse2_clflush;
11129 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11130 op0 = ix86_zero_extend_to_Pmode (op0);
11131
11132 emit_insn (gen_sse2_clflush (op0));
11133 return 0;
11134
11135 case IX86_BUILTIN_CLWB:
11136 arg0 = CALL_EXPR_ARG (exp, 0);
11137 op0 = expand_normal (arg0);
11138 icode = CODE_FOR_clwb;
11139 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11140 op0 = ix86_zero_extend_to_Pmode (op0);
11141
11142 emit_insn (gen_clwb (op0));
11143 return 0;
11144
11145 case IX86_BUILTIN_CLFLUSHOPT:
11146 arg0 = CALL_EXPR_ARG (exp, 0);
11147 op0 = expand_normal (arg0);
11148 icode = CODE_FOR_clflushopt;
11149 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11150 op0 = ix86_zero_extend_to_Pmode (op0);
11151
11152 emit_insn (gen_clflushopt (op0));
11153 return 0;
11154
11155 case IX86_BUILTIN_MONITOR:
11156 case IX86_BUILTIN_MONITORX:
11157 arg0 = CALL_EXPR_ARG (exp, 0);
11158 arg1 = CALL_EXPR_ARG (exp, 1);
11159 arg2 = CALL_EXPR_ARG (exp, 2);
11160 op0 = expand_normal (arg0);
11161 op1 = expand_normal (arg1);
11162 op2 = expand_normal (arg2);
11163 if (!REG_P (op0))
11164 op0 = ix86_zero_extend_to_Pmode (op0);
11165 if (!REG_P (op1))
11166 op1 = copy_to_mode_reg (SImode, op1);
11167 if (!REG_P (op2))
11168 op2 = copy_to_mode_reg (SImode, op2);
11169
11170 emit_insn (fcode == IX86_BUILTIN_MONITOR
11171 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11172 : gen_monitorx (Pmode, op0, op1, op2));
11173 return 0;
11174
11175 case IX86_BUILTIN_MWAIT:
11176 arg0 = CALL_EXPR_ARG (exp, 0);
11177 arg1 = CALL_EXPR_ARG (exp, 1);
11178 op0 = expand_normal (arg0);
11179 op1 = expand_normal (arg1);
11180 if (!REG_P (op0))
11181 op0 = copy_to_mode_reg (SImode, op0);
11182 if (!REG_P (op1))
11183 op1 = copy_to_mode_reg (SImode, op1);
11184 emit_insn (gen_sse3_mwait (op0, op1));
11185 return 0;
11186
11187 case IX86_BUILTIN_MWAITX:
11188 arg0 = CALL_EXPR_ARG (exp, 0);
11189 arg1 = CALL_EXPR_ARG (exp, 1);
11190 arg2 = CALL_EXPR_ARG (exp, 2);
11191 op0 = expand_normal (arg0);
11192 op1 = expand_normal (arg1);
11193 op2 = expand_normal (arg2);
11194 if (!REG_P (op0))
11195 op0 = copy_to_mode_reg (SImode, op0);
11196 if (!REG_P (op1))
11197 op1 = copy_to_mode_reg (SImode, op1);
11198 if (!REG_P (op2))
11199 op2 = copy_to_mode_reg (SImode, op2);
11200 emit_insn (gen_mwaitx (op0, op1, op2));
11201 return 0;
11202
11203 case IX86_BUILTIN_UMONITOR:
11204 arg0 = CALL_EXPR_ARG (exp, 0);
11205 op0 = expand_normal (arg0);
11206
11207 op0 = ix86_zero_extend_to_Pmode (op0);
11208 emit_insn (gen_umonitor (Pmode, op0));
11209 return 0;
11210
11211 case IX86_BUILTIN_UMWAIT:
11212 case IX86_BUILTIN_TPAUSE:
11213 arg0 = CALL_EXPR_ARG (exp, 0);
11214 arg1 = CALL_EXPR_ARG (exp, 1);
11215 op0 = expand_normal (arg0);
11216 op1 = expand_normal (arg1);
11217
11218 if (!REG_P (op0))
11219 op0 = copy_to_mode_reg (SImode, op0);
11220
11221 op1 = force_reg (DImode, op1);
11222
11223 if (TARGET_64BIT)
11224 {
11225 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11226 NULL, 1, OPTAB_DIRECT);
11227 switch (fcode)
11228 {
11229 case IX86_BUILTIN_UMWAIT:
11230 icode = CODE_FOR_umwait_rex64;
11231 break;
11232 case IX86_BUILTIN_TPAUSE:
11233 icode = CODE_FOR_tpause_rex64;
11234 break;
11235 default:
11236 gcc_unreachable ();
11237 }
11238
11239 op2 = gen_lowpart (SImode, op2);
11240 op1 = gen_lowpart (SImode, op1);
11241 pat = GEN_FCN (icode) (op0, op1, op2);
11242 }
11243 else
11244 {
11245 switch (fcode)
11246 {
11247 case IX86_BUILTIN_UMWAIT:
11248 icode = CODE_FOR_umwait;
11249 break;
11250 case IX86_BUILTIN_TPAUSE:
11251 icode = CODE_FOR_tpause;
11252 break;
11253 default:
11254 gcc_unreachable ();
11255 }
11256 pat = GEN_FCN (icode) (op0, op1);
11257 }
11258
11259 if (!pat)
11260 return 0;
11261
11262 emit_insn (pat);
11263
11264 if (target == 0
11265 || !register_operand (target, QImode))
11266 target = gen_reg_rtx (QImode);
11267
11268 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11269 const0_rtx);
11270 emit_insn (gen_rtx_SET (target, pat));
11271
11272 return target;
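/* Sketch of the deadline handling above, assuming the waitpkg
   _umwait/_tpause wrappers: the 64-bit TSC deadline is passed to the
   instruction as a register pair, so on 64-bit targets the DImode
   value is split as

       lo = (unsigned int) deadline;
       hi = (unsigned int) (deadline >> 32);

   and the builtin's 0/1 return value is then derived from the carry
   flag the instruction sets.  */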
11273
11274 case IX86_BUILTIN_CLZERO:
11275 arg0 = CALL_EXPR_ARG (exp, 0);
11276 op0 = expand_normal (arg0);
11277 if (!REG_P (op0))
11278 op0 = ix86_zero_extend_to_Pmode (op0);
11279 emit_insn (gen_clzero (Pmode, op0));
11280 return 0;
11281
11282 case IX86_BUILTIN_CLDEMOTE:
11283 arg0 = CALL_EXPR_ARG (exp, 0);
11284 op0 = expand_normal (arg0);
11285 icode = CODE_FOR_cldemote;
11286 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11287 op0 = ix86_zero_extend_to_Pmode (op0);
11288
11289 emit_insn (gen_cldemote (op0));
11290 return 0;
11291
11292 case IX86_BUILTIN_VEC_INIT_V2SI:
11293 case IX86_BUILTIN_VEC_INIT_V4HI:
11294 case IX86_BUILTIN_VEC_INIT_V8QI:
11295 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11296
11297 case IX86_BUILTIN_VEC_EXT_V2DF:
11298 case IX86_BUILTIN_VEC_EXT_V2DI:
11299 case IX86_BUILTIN_VEC_EXT_V4SF:
11300 case IX86_BUILTIN_VEC_EXT_V4SI:
11301 case IX86_BUILTIN_VEC_EXT_V8HI:
11302 case IX86_BUILTIN_VEC_EXT_V2SI:
11303 case IX86_BUILTIN_VEC_EXT_V4HI:
11304 case IX86_BUILTIN_VEC_EXT_V16QI:
11305 return ix86_expand_vec_ext_builtin (exp, target);
11306
11307 case IX86_BUILTIN_VEC_SET_V2DI:
11308 case IX86_BUILTIN_VEC_SET_V4SF:
11309 case IX86_BUILTIN_VEC_SET_V4SI:
11310 case IX86_BUILTIN_VEC_SET_V8HI:
11311 case IX86_BUILTIN_VEC_SET_V4HI:
11312 case IX86_BUILTIN_VEC_SET_V16QI:
11313 return ix86_expand_vec_set_builtin (exp);
11314
11315 case IX86_BUILTIN_NANQ:
11316 case IX86_BUILTIN_NANSQ:
11317 return expand_call (exp, target, ignore);
11318
11319 case IX86_BUILTIN_RDPID:
11320
11321 op0 = gen_reg_rtx (word_mode);
11322
11323 if (TARGET_64BIT)
11324 {
11325 insn = gen_rdpid_rex64 (op0);
11326 op0 = convert_to_mode (SImode, op0, 1);
11327 }
11328 else
11329 insn = gen_rdpid (op0);
11330
11331 emit_insn (insn);
11332
11333 if (target == 0
11334 || !register_operand (target, SImode))
11335 target = gen_reg_rtx (SImode);
11336
11337 emit_move_insn (target, op0);
11338 return target;
11339
11340 case IX86_BUILTIN_2INTERSECTD512:
11341 case IX86_BUILTIN_2INTERSECTQ512:
11342 case IX86_BUILTIN_2INTERSECTD256:
11343 case IX86_BUILTIN_2INTERSECTQ256:
11344 case IX86_BUILTIN_2INTERSECTD128:
11345 case IX86_BUILTIN_2INTERSECTQ128:
11346 arg0 = CALL_EXPR_ARG (exp, 0);
11347 arg1 = CALL_EXPR_ARG (exp, 1);
11348 arg2 = CALL_EXPR_ARG (exp, 2);
11349 arg3 = CALL_EXPR_ARG (exp, 3);
11350 op0 = expand_normal (arg0);
11351 op1 = expand_normal (arg1);
11352 op2 = expand_normal (arg2);
11353 op3 = expand_normal (arg3);
11354
11355 if (!address_operand (op0, VOIDmode))
11356 {
11357 op0 = convert_memory_address (Pmode, op0);
11358 op0 = copy_addr_to_reg (op0);
11359 }
11360 if (!address_operand (op1, VOIDmode))
11361 {
11362 op1 = convert_memory_address (Pmode, op1);
11363 op1 = copy_addr_to_reg (op1);
11364 }
11365
11366 switch (fcode)
11367 {
11368 case IX86_BUILTIN_2INTERSECTD512:
11369 mode4 = P2HImode;
11370 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11371 break;
11372 case IX86_BUILTIN_2INTERSECTQ512:
11373 mode4 = P2QImode;
11374 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11375 break;
11376 case IX86_BUILTIN_2INTERSECTD256:
11377 mode4 = P2QImode;
11378 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11379 break;
11380 case IX86_BUILTIN_2INTERSECTQ256:
11381 mode4 = P2QImode;
11382 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11383 break;
11384 case IX86_BUILTIN_2INTERSECTD128:
11385 mode4 = P2QImode;
11386 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11387 break;
11388 case IX86_BUILTIN_2INTERSECTQ128:
11389 mode4 = P2QImode;
11390 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11391 break;
11392 default:
11393 gcc_unreachable ();
11394 }
11395
11396 mode2 = insn_data[icode].operand[1].mode;
11397 mode3 = insn_data[icode].operand[2].mode;
11398 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11399 op2 = copy_to_mode_reg (mode2, op2);
11400 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11401 op3 = copy_to_mode_reg (mode3, op3);
11402
11403 op4 = gen_reg_rtx (mode4);
11404 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11405 mode0 = mode4 == P2HImode ? HImode : QImode;
11406 emit_move_insn (gen_rtx_MEM (mode0, op0),
11407 gen_lowpart (mode0, op4));
11408 emit_move_insn (gen_rtx_MEM (mode0, op1),
11409 gen_highpart (mode0, op4));
11410
11411 return 0;
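/* VP2INTERSECT is unusual in producing a pair of mask registers,
   modelled here by the P2HImode/P2QImode register-pair modes; the low
   and high parts of that pair are stored through the two mask
   pointers (arg0 and arg1).  A hedged sketch of the intended use,
   assuming the avx512vp2intersect intrinsic names:

       __mmask16 lo, hi;
       _mm512_2intersect_epi32 (x, y, &lo, &hi);
*/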
11412
11413 case IX86_BUILTIN_RDPMC:
11414 case IX86_BUILTIN_RDTSC:
11415 case IX86_BUILTIN_RDTSCP:
11416 case IX86_BUILTIN_XGETBV:
11417
11418 op0 = gen_reg_rtx (DImode);
11419 op1 = gen_reg_rtx (DImode);
11420
11421 if (fcode == IX86_BUILTIN_RDPMC)
11422 {
11423 arg0 = CALL_EXPR_ARG (exp, 0);
11424 op2 = expand_normal (arg0);
11425 if (!register_operand (op2, SImode))
11426 op2 = copy_to_mode_reg (SImode, op2);
11427
11428 insn = (TARGET_64BIT
11429 ? gen_rdpmc_rex64 (op0, op1, op2)
11430 : gen_rdpmc (op0, op2));
11431 emit_insn (insn);
11432 }
11433 else if (fcode == IX86_BUILTIN_XGETBV)
11434 {
11435 arg0 = CALL_EXPR_ARG (exp, 0);
11436 op2 = expand_normal (arg0);
11437 if (!register_operand (op2, SImode))
11438 op2 = copy_to_mode_reg (SImode, op2);
11439
11440 insn = (TARGET_64BIT
11441 ? gen_xgetbv_rex64 (op0, op1, op2)
11442 : gen_xgetbv (op0, op2));
11443 emit_insn (insn);
11444 }
11445 else if (fcode == IX86_BUILTIN_RDTSC)
11446 {
11447 insn = (TARGET_64BIT
11448 ? gen_rdtsc_rex64 (op0, op1)
11449 : gen_rdtsc (op0));
11450 emit_insn (insn);
11451 }
11452 else
11453 {
11454 op2 = gen_reg_rtx (SImode);
11455
11456 insn = (TARGET_64BIT
11457 ? gen_rdtscp_rex64 (op0, op1, op2)
11458 : gen_rdtscp (op0, op2));
11459 emit_insn (insn);
11460
11461 arg0 = CALL_EXPR_ARG (exp, 0);
11462 op4 = expand_normal (arg0);
11463 if (!address_operand (op4, VOIDmode))
11464 {
11465 op4 = convert_memory_address (Pmode, op4);
11466 op4 = copy_addr_to_reg (op4);
11467 }
11468 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11469 }
11470
11471 if (target == 0
11472 || !register_operand (target, DImode))
11473 target = gen_reg_rtx (DImode);
11474
11475 if (TARGET_64BIT)
11476 {
11477 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11478 op1, 1, OPTAB_DIRECT);
11479 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11480 op0, 1, OPTAB_DIRECT);
11481 }
11482
11483 emit_move_insn (target, op0);
11484 return target;
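/* These builtins return a 64-bit value that the hardware delivers as
   an EDX:EAX pair; on 64-bit targets the two halves are recombined
   above with a shift and an IOR, i.e.
   result = ((unsigned long long) hi << 32) | lo.
   A minimal usage sketch, assuming the <x86intrin.h> wrapper:

       unsigned long long t0 = __rdtsc ();
*/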
11485
11486 case IX86_BUILTIN_ENQCMD:
11487 case IX86_BUILTIN_ENQCMDS:
11488 case IX86_BUILTIN_MOVDIR64B:
11489
11490 arg0 = CALL_EXPR_ARG (exp, 0);
11491 arg1 = CALL_EXPR_ARG (exp, 1);
11492 op0 = expand_normal (arg0);
11493 op1 = expand_normal (arg1);
11494
11495 op0 = ix86_zero_extend_to_Pmode (op0);
11496 if (!address_operand (op1, VOIDmode))
11497 {
11498 op1 = convert_memory_address (Pmode, op1);
11499 op1 = copy_addr_to_reg (op1);
11500 }
11501 op1 = gen_rtx_MEM (XImode, op1);
11502
11503 if (fcode == IX86_BUILTIN_MOVDIR64B)
11504 {
11505 emit_insn (gen_movdir64b (Pmode, op0, op1));
11506 return 0;
11507 }
11508 else
11509 {
11510 rtx pat;
11511
11512 target = gen_reg_rtx (SImode);
11513 emit_move_insn (target, const0_rtx);
11514 target = gen_rtx_SUBREG (QImode, target, 0);
11515
11516 if (fcode == IX86_BUILTIN_ENQCMD)
11517 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11518 else
11519 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11520
11521 emit_insn (pat);
11522
11523 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11524 gen_rtx_fmt_ee (EQ, QImode,
11525 SET_DEST (pat),
11526 const0_rtx)));
11527
11528 return SUBREG_REG (target);
11529 }
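/* MOVDIR64B and ENQCMD/ENQCMDS all copy a 64-byte command, modelled
   as the XImode MEM above, to the destination address in op0.  The
   ENQCMD variants additionally report a status in the flags, which
   the expansion turns into a QImode 0/1 return value via the
   STRICT_LOW_PART store.  A hedged sketch, assuming the _movdir64b
   wrapper from <immintrin.h>:

       _movdir64b (dst, src);   // copies 64 bytes, no status result
*/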
11530
11531 case IX86_BUILTIN_FXSAVE:
11532 case IX86_BUILTIN_FXRSTOR:
11533 case IX86_BUILTIN_FXSAVE64:
11534 case IX86_BUILTIN_FXRSTOR64:
11535 case IX86_BUILTIN_FNSTENV:
11536 case IX86_BUILTIN_FLDENV:
11537 mode0 = BLKmode;
11538 switch (fcode)
11539 {
11540 case IX86_BUILTIN_FXSAVE:
11541 icode = CODE_FOR_fxsave;
11542 break;
11543 case IX86_BUILTIN_FXRSTOR:
11544 icode = CODE_FOR_fxrstor;
11545 break;
11546 case IX86_BUILTIN_FXSAVE64:
11547 icode = CODE_FOR_fxsave64;
11548 break;
11549 case IX86_BUILTIN_FXRSTOR64:
11550 icode = CODE_FOR_fxrstor64;
11551 break;
11552 case IX86_BUILTIN_FNSTENV:
11553 icode = CODE_FOR_fnstenv;
11554 break;
11555 case IX86_BUILTIN_FLDENV:
11556 icode = CODE_FOR_fldenv;
11557 break;
11558 default:
11559 gcc_unreachable ();
11560 }
11561
11562 arg0 = CALL_EXPR_ARG (exp, 0);
11563 op0 = expand_normal (arg0);
11564
11565 if (!address_operand (op0, VOIDmode))
11566 {
11567 op0 = convert_memory_address (Pmode, op0);
11568 op0 = copy_addr_to_reg (op0);
11569 }
11570 op0 = gen_rtx_MEM (mode0, op0);
11571
11572 pat = GEN_FCN (icode) (op0);
11573 if (pat)
11574 emit_insn (pat);
11575 return 0;
11576
11577 case IX86_BUILTIN_XSETBV:
11578 arg0 = CALL_EXPR_ARG (exp, 0);
11579 arg1 = CALL_EXPR_ARG (exp, 1);
11580 op0 = expand_normal (arg0);
11581 op1 = expand_normal (arg1);
11582
11583 if (!REG_P (op0))
11584 op0 = copy_to_mode_reg (SImode, op0);
11585
11586 op1 = force_reg (DImode, op1);
11587
11588 if (TARGET_64BIT)
11589 {
11590 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11591 NULL, 1, OPTAB_DIRECT);
11592
11593 icode = CODE_FOR_xsetbv_rex64;
11594
11595 op2 = gen_lowpart (SImode, op2);
11596 op1 = gen_lowpart (SImode, op1);
11597 pat = GEN_FCN (icode) (op0, op1, op2);
11598 }
11599 else
11600 {
11601 icode = CODE_FOR_xsetbv;
11602
11603 pat = GEN_FCN (icode) (op0, op1);
11604 }
11605 if (pat)
11606 emit_insn (pat);
11607 return 0;
11608
11609 case IX86_BUILTIN_XSAVE:
11610 case IX86_BUILTIN_XRSTOR:
11611 case IX86_BUILTIN_XSAVE64:
11612 case IX86_BUILTIN_XRSTOR64:
11613 case IX86_BUILTIN_XSAVEOPT:
11614 case IX86_BUILTIN_XSAVEOPT64:
11615 case IX86_BUILTIN_XSAVES:
11616 case IX86_BUILTIN_XRSTORS:
11617 case IX86_BUILTIN_XSAVES64:
11618 case IX86_BUILTIN_XRSTORS64:
11619 case IX86_BUILTIN_XSAVEC:
11620 case IX86_BUILTIN_XSAVEC64:
11621 arg0 = CALL_EXPR_ARG (exp, 0);
11622 arg1 = CALL_EXPR_ARG (exp, 1);
11623 op0 = expand_normal (arg0);
11624 op1 = expand_normal (arg1);
11625
11626 if (!address_operand (op0, VOIDmode))
11627 {
11628 op0 = convert_memory_address (Pmode, op0);
11629 op0 = copy_addr_to_reg (op0);
11630 }
11631 op0 = gen_rtx_MEM (BLKmode, op0);
11632
11633 op1 = force_reg (DImode, op1);
11634
11635 if (TARGET_64BIT)
11636 {
11637 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11638 NULL, 1, OPTAB_DIRECT);
11639 switch (fcode)
11640 {
11641 case IX86_BUILTIN_XSAVE:
11642 icode = CODE_FOR_xsave_rex64;
11643 break;
11644 case IX86_BUILTIN_XRSTOR:
11645 icode = CODE_FOR_xrstor_rex64;
11646 break;
11647 case IX86_BUILTIN_XSAVE64:
11648 icode = CODE_FOR_xsave64;
11649 break;
11650 case IX86_BUILTIN_XRSTOR64:
11651 icode = CODE_FOR_xrstor64;
11652 break;
11653 case IX86_BUILTIN_XSAVEOPT:
11654 icode = CODE_FOR_xsaveopt_rex64;
11655 break;
11656 case IX86_BUILTIN_XSAVEOPT64:
11657 icode = CODE_FOR_xsaveopt64;
11658 break;
11659 case IX86_BUILTIN_XSAVES:
11660 icode = CODE_FOR_xsaves_rex64;
11661 break;
11662 case IX86_BUILTIN_XRSTORS:
11663 icode = CODE_FOR_xrstors_rex64;
11664 break;
11665 case IX86_BUILTIN_XSAVES64:
11666 icode = CODE_FOR_xsaves64;
11667 break;
11668 case IX86_BUILTIN_XRSTORS64:
11669 icode = CODE_FOR_xrstors64;
11670 break;
11671 case IX86_BUILTIN_XSAVEC:
11672 icode = CODE_FOR_xsavec_rex64;
11673 break;
11674 case IX86_BUILTIN_XSAVEC64:
11675 icode = CODE_FOR_xsavec64;
11676 break;
11677 default:
11678 gcc_unreachable ();
11679 }
11680
11681 op2 = gen_lowpart (SImode, op2);
11682 op1 = gen_lowpart (SImode, op1);
11683 pat = GEN_FCN (icode) (op0, op1, op2);
11684 }
11685 else
11686 {
11687 switch (fcode)
11688 {
11689 case IX86_BUILTIN_XSAVE:
11690 icode = CODE_FOR_xsave;
11691 break;
11692 case IX86_BUILTIN_XRSTOR:
11693 icode = CODE_FOR_xrstor;
11694 break;
11695 case IX86_BUILTIN_XSAVEOPT:
11696 icode = CODE_FOR_xsaveopt;
11697 break;
11698 case IX86_BUILTIN_XSAVES:
11699 icode = CODE_FOR_xsaves;
11700 break;
11701 case IX86_BUILTIN_XRSTORS:
11702 icode = CODE_FOR_xrstors;
11703 break;
11704 case IX86_BUILTIN_XSAVEC:
11705 icode = CODE_FOR_xsavec;
11706 break;
11707 default:
11708 gcc_unreachable ();
11709 }
11710 pat = GEN_FCN (icode) (op0, op1);
11711 }
11712
11713 if (pat)
11714 emit_insn (pat);
11715 return 0;
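/* The XSAVE-family instructions take the requested-feature bitmap as
   an EDX:EAX register pair, so on 64-bit targets the DImode mask is
   split the same way as for UMWAIT/XSETBV above:

       eax = (unsigned int) mask;
       edx = (unsigned int) (mask >> 32);

   On 32-bit targets the DImode operand is handed to the pattern
   directly.  */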
11716
11717 case IX86_BUILTIN_LLWPCB:
11718 arg0 = CALL_EXPR_ARG (exp, 0);
11719 op0 = expand_normal (arg0);
11720 icode = CODE_FOR_lwp_llwpcb;
11721 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11722 op0 = ix86_zero_extend_to_Pmode (op0);
11723 emit_insn (gen_lwp_llwpcb (op0));
11724 return 0;
11725
11726 case IX86_BUILTIN_SLWPCB:
11727 icode = CODE_FOR_lwp_slwpcb;
11728 if (!target
11729 || !insn_data[icode].operand[0].predicate (target, Pmode))
11730 target = gen_reg_rtx (Pmode);
11731 emit_insn (gen_lwp_slwpcb (target));
11732 return target;
11733
11734 case IX86_BUILTIN_BEXTRI32:
11735 case IX86_BUILTIN_BEXTRI64:
11736 arg0 = CALL_EXPR_ARG (exp, 0);
11737 arg1 = CALL_EXPR_ARG (exp, 1);
11738 op0 = expand_normal (arg0);
11739 op1 = expand_normal (arg1);
11740 icode = (fcode == IX86_BUILTIN_BEXTRI32
11741 ? CODE_FOR_tbm_bextri_si
11742 : CODE_FOR_tbm_bextri_di);
11743 if (!CONST_INT_P (op1))
11744 {
11745 error ("last argument must be an immediate");
11746 return const0_rtx;
11747 }
11748 else
11749 {
11750 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11751 unsigned char lsb_index = INTVAL (op1) & 0xFF;
11752 op1 = GEN_INT (length);
11753 op2 = GEN_INT (lsb_index);
11754
11755 mode1 = insn_data[icode].operand[1].mode;
11756 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11757 op0 = copy_to_mode_reg (mode1, op0);
11758
11759 mode0 = insn_data[icode].operand[0].mode;
11760 if (target == 0
11761 || !register_operand (target, mode0))
11762 target = gen_reg_rtx (mode0);
11763
11764 pat = GEN_FCN (icode) (target, op0, op1, op2);
11765 if (pat)
11766 emit_insn (pat);
11767 return target;
11768 }
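/* The BEXTRI control word packs the bit-field start index in bits
   [7:0] and the field length in bits [15:8]; the expansion above
   splits the immediate into those two pieces for the tbm_bextri
   patterns.  A hedged usage sketch, assuming the <tbmintrin.h>
   wrapper:

       // extract 8 bits starting at bit 4: control = (len << 8) | start
       unsigned int f = __bextri_u32 (x, (8 << 8) | 4);
*/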
11769
11770 case IX86_BUILTIN_RDRAND16_STEP:
11771 icode = CODE_FOR_rdrandhi_1;
11772 mode0 = HImode;
11773 goto rdrand_step;
11774
11775 case IX86_BUILTIN_RDRAND32_STEP:
11776 icode = CODE_FOR_rdrandsi_1;
11777 mode0 = SImode;
11778 goto rdrand_step;
11779
11780 case IX86_BUILTIN_RDRAND64_STEP:
11781 icode = CODE_FOR_rdranddi_1;
11782 mode0 = DImode;
11783
11784 rdrand_step:
11785 arg0 = CALL_EXPR_ARG (exp, 0);
11786 op1 = expand_normal (arg0);
11787 if (!address_operand (op1, VOIDmode))
11788 {
11789 op1 = convert_memory_address (Pmode, op1);
11790 op1 = copy_addr_to_reg (op1);
11791 }
11792
11793 op0 = gen_reg_rtx (mode0);
11794 emit_insn (GEN_FCN (icode) (op0));
11795
11796 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11797
11798 op1 = gen_reg_rtx (SImode);
11799 emit_move_insn (op1, CONST1_RTX (SImode));
11800
11801 /* Emit SImode conditional move. */
11802 if (mode0 == HImode)
11803 {
11804 if (TARGET_ZERO_EXTEND_WITH_AND
11805 && optimize_function_for_speed_p (cfun))
11806 {
11807 op2 = force_reg (SImode, const0_rtx);
11808
11809 emit_insn (gen_movstricthi
11810 (gen_lowpart (HImode, op2), op0));
11811 }
11812 else
11813 {
11814 op2 = gen_reg_rtx (SImode);
11815
11816 emit_insn (gen_zero_extendhisi2 (op2, op0));
11817 }
11818 }
11819 else if (mode0 == SImode)
11820 op2 = op0;
11821 else
11822 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11823
11824 if (target == 0
11825 || !register_operand (target, SImode))
11826 target = gen_reg_rtx (SImode);
11827
11828 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11829 const0_rtx);
11830 emit_insn (gen_rtx_SET (target,
11831 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11832 return target;
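/* RDRAND reports success in the carry flag, and the expansion relies
   on the destination being zeroed when no random value was available,
   so the conditional move above yields the 0/1 status without a
   setcc.  Typical use of the corresponding step intrinsic (a sketch):

       unsigned int r;
       while (!_rdrand32_step (&r))
         ;   // retry until the DRNG returns a value
*/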
11833
11834 case IX86_BUILTIN_RDSEED16_STEP:
11835 icode = CODE_FOR_rdseedhi_1;
11836 mode0 = HImode;
11837 goto rdseed_step;
11838
11839 case IX86_BUILTIN_RDSEED32_STEP:
11840 icode = CODE_FOR_rdseedsi_1;
11841 mode0 = SImode;
11842 goto rdseed_step;
11843
11844 case IX86_BUILTIN_RDSEED64_STEP:
11845 icode = CODE_FOR_rdseeddi_1;
11846 mode0 = DImode;
11847
11848 rdseed_step:
11849 arg0 = CALL_EXPR_ARG (exp, 0);
11850 op1 = expand_normal (arg0);
11851 if (!address_operand (op1, VOIDmode))
11852 {
11853 op1 = convert_memory_address (Pmode, op1);
11854 op1 = copy_addr_to_reg (op1);
11855 }
11856
11857 op0 = gen_reg_rtx (mode0);
11858 emit_insn (GEN_FCN (icode) (op0));
11859
11860 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11861
11862 op2 = gen_reg_rtx (QImode);
11863
11864 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11865 const0_rtx);
11866 emit_insn (gen_rtx_SET (op2, pat));
11867
11868 if (target == 0
11869 || !register_operand (target, SImode))
11870 target = gen_reg_rtx (SImode);
11871
11872 emit_insn (gen_zero_extendqisi2 (target, op2));
11873 return target;
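/* RDSEED follows the same step-intrinsic shape as RDRAND above
   (e.g. _rdseed32_step (&r)), but the 0/1 result is materialized
   directly from the carry flag with a setcc (LTU on CCCmode) and a
   zero extension instead of a conditional move.  */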
11874
11875 case IX86_BUILTIN_SBB32:
11876 icode = CODE_FOR_subborrowsi;
11877 icode2 = CODE_FOR_subborrowsi_0;
11878 mode0 = SImode;
11879 mode1 = DImode;
11880 mode2 = CCmode;
11881 goto handlecarry;
11882
11883 case IX86_BUILTIN_SBB64:
11884 icode = CODE_FOR_subborrowdi;
11885 icode2 = CODE_FOR_subborrowdi_0;
11886 mode0 = DImode;
11887 mode1 = TImode;
11888 mode2 = CCmode;
11889 goto handlecarry;
11890
11891 case IX86_BUILTIN_ADDCARRYX32:
11892 icode = CODE_FOR_addcarrysi;
11893 icode2 = CODE_FOR_addcarrysi_0;
11894 mode0 = SImode;
11895 mode1 = DImode;
11896 mode2 = CCCmode;
11897 goto handlecarry;
11898
11899 case IX86_BUILTIN_ADDCARRYX64:
11900 icode = CODE_FOR_addcarrydi;
11901 icode2 = CODE_FOR_addcarrydi_0;
11902 mode0 = DImode;
11903 mode1 = TImode;
11904 mode2 = CCCmode;
11905
11906 handlecarry:
11907 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11908 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11909 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11910 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
11911
11912 op1 = expand_normal (arg0);
11913 if (!integer_zerop (arg0))
11914 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11915
11916 op2 = expand_normal (arg1);
11917 if (!register_operand (op2, mode0))
11918 op2 = copy_to_mode_reg (mode0, op2);
11919
11920 op3 = expand_normal (arg2);
11921 if (!register_operand (op3, mode0))
11922 op3 = copy_to_mode_reg (mode0, op3);
11923
11924 op4 = expand_normal (arg3);
11925 if (!address_operand (op4, VOIDmode))
11926 {
11927 op4 = convert_memory_address (Pmode, op4);
11928 op4 = copy_addr_to_reg (op4);
11929 }
11930
11931 op0 = gen_reg_rtx (mode0);
11932 if (integer_zerop (arg0))
11933 {
11934 /* If arg0 is 0, optimize right away into an add or sub
11935 instruction that sets the CCCmode flags. */
11936 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11937 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11938 }
11939 else
11940 {
11941 /* Generate CF from input operand. */
11942 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11943
11944 /* Generate instruction that consumes CF. */
11945 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11946 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11947 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11948 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11949 }
11950
11951 /* Return current CF value. */
11952 if (target == 0)
11953 target = gen_reg_rtx (QImode);
11954
11955 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11956 emit_insn (gen_rtx_SET (target, pat));
11957
11958 /* Store the result. */
11959 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11960
11961 return target;
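/* A minimal sketch of how these carry-chaining builtins are meant to
   be used, assuming the usual _addcarry_u32 wrapper: a two-limb
   addition threads the carry through c:

       unsigned int lo, hi;
       unsigned char c = _addcarry_u32 (0, a0, b0, &lo);
       (void) _addcarry_u32 (c, a1, b1, &hi);

   The integer_zerop (arg0) fast path above matches the first link of
   such a chain, where no carry-in has to be modelled.  */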
11962
11963 case IX86_BUILTIN_READ_FLAGS:
11964 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11965
11966 if (optimize
11967 || target == NULL_RTX
11968 || !nonimmediate_operand (target, word_mode)
11969 || GET_MODE (target) != word_mode)
11970 target = gen_reg_rtx (word_mode);
11971
11972 emit_insn (gen_pop (target));
11973 return target;
11974
11975 case IX86_BUILTIN_WRITE_FLAGS:
11976
11977 arg0 = CALL_EXPR_ARG (exp, 0);
11978 op0 = expand_normal (arg0);
11979 if (!general_no_elim_operand (op0, word_mode))
11980 op0 = copy_to_mode_reg (word_mode, op0);
11981
11982 emit_insn (gen_push (op0));
11983 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11984 return 0;
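/* The flags register cannot be accessed directly, so these builtins
   go through the stack: PUSHF + POP to read, PUSH + POPF to write.
   A hedged sketch, assuming the __readeflags/__writeeflags wrappers
   on a 64-bit target:

       unsigned long long f = __readeflags ();
       __writeeflags (f | 1);   // e.g. set CF
*/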
11985
11986 case IX86_BUILTIN_KTESTC8:
11987 icode = CODE_FOR_ktestqi;
11988 mode3 = CCCmode;
11989 goto kortest;
11990
11991 case IX86_BUILTIN_KTESTZ8:
11992 icode = CODE_FOR_ktestqi;
11993 mode3 = CCZmode;
11994 goto kortest;
11995
11996 case IX86_BUILTIN_KTESTC16:
11997 icode = CODE_FOR_ktesthi;
11998 mode3 = CCCmode;
11999 goto kortest;
12000
12001 case IX86_BUILTIN_KTESTZ16:
12002 icode = CODE_FOR_ktesthi;
12003 mode3 = CCZmode;
12004 goto kortest;
12005
12006 case IX86_BUILTIN_KTESTC32:
12007 icode = CODE_FOR_ktestsi;
12008 mode3 = CCCmode;
12009 goto kortest;
12010
12011 case IX86_BUILTIN_KTESTZ32:
12012 icode = CODE_FOR_ktestsi;
12013 mode3 = CCZmode;
12014 goto kortest;
12015
12016 case IX86_BUILTIN_KTESTC64:
12017 icode = CODE_FOR_ktestdi;
12018 mode3 = CCCmode;
12019 goto kortest;
12020
12021 case IX86_BUILTIN_KTESTZ64:
12022 icode = CODE_FOR_ktestdi;
12023 mode3 = CCZmode;
12024 goto kortest;
12025
12026 case IX86_BUILTIN_KORTESTC8:
12027 icode = CODE_FOR_kortestqi;
12028 mode3 = CCCmode;
12029 goto kortest;
12030
12031 case IX86_BUILTIN_KORTESTZ8:
12032 icode = CODE_FOR_kortestqi;
12033 mode3 = CCZmode;
12034 goto kortest;
12035
12036 case IX86_BUILTIN_KORTESTC16:
12037 icode = CODE_FOR_kortesthi;
12038 mode3 = CCCmode;
12039 goto kortest;
12040
12041 case IX86_BUILTIN_KORTESTZ16:
12042 icode = CODE_FOR_kortesthi;
12043 mode3 = CCZmode;
12044 goto kortest;
12045
12046 case IX86_BUILTIN_KORTESTC32:
12047 icode = CODE_FOR_kortestsi;
12048 mode3 = CCCmode;
12049 goto kortest;
12050
12051 case IX86_BUILTIN_KORTESTZ32:
12052 icode = CODE_FOR_kortestsi;
12053 mode3 = CCZmode;
12054 goto kortest;
12055
12056 case IX86_BUILTIN_KORTESTC64:
12057 icode = CODE_FOR_kortestdi;
12058 mode3 = CCCmode;
12059 goto kortest;
12060
12061 case IX86_BUILTIN_KORTESTZ64:
12062 icode = CODE_FOR_kortestdi;
12063 mode3 = CCZmode;
12064
12065 kortest:
12066 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12067 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12068 op0 = expand_normal (arg0);
12069 op1 = expand_normal (arg1);
12070
12071 mode0 = insn_data[icode].operand[0].mode;
12072 mode1 = insn_data[icode].operand[1].mode;
12073
12074 if (GET_MODE (op0) != VOIDmode)
12075 op0 = force_reg (GET_MODE (op0), op0);
12076
12077 op0 = gen_lowpart (mode0, op0);
12078
12079 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12080 op0 = copy_to_mode_reg (mode0, op0);
12081
12082 if (GET_MODE (op1) != VOIDmode)
12083 op1 = force_reg (GET_MODE (op1), op1);
12084
12085 op1 = gen_lowpart (mode1, op1);
12086
12087 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12088 op1 = copy_to_mode_reg (mode1, op1);
12089
12090 target = gen_reg_rtx (QImode);
12091
12092 /* Emit kortest. */
12093 emit_insn (GEN_FCN (icode) (op0, op1));
12094 /* And use setcc to return result from flags. */
12095 ix86_expand_setcc (target, EQ,
12096 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12097 return target;
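/* KORTEST sets ZF when the OR of the two masks is all zeros and CF
   when it is all ones (KTEST uses analogous conditions), which is why
   the *_Z builtins test CCZmode and the *_C builtins test CCCmode
   here.  A hedged usage sketch:

       __mmask16 a, b;
       int all_zero = _mm512_kortestz (a, b);   // 1 iff (a | b) == 0
*/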
12098
12099 case IX86_BUILTIN_GATHERSIV2DF:
12100 icode = CODE_FOR_avx2_gathersiv2df;
12101 goto gather_gen;
12102 case IX86_BUILTIN_GATHERSIV4DF:
12103 icode = CODE_FOR_avx2_gathersiv4df;
12104 goto gather_gen;
12105 case IX86_BUILTIN_GATHERDIV2DF:
12106 icode = CODE_FOR_avx2_gatherdiv2df;
12107 goto gather_gen;
12108 case IX86_BUILTIN_GATHERDIV4DF:
12109 icode = CODE_FOR_avx2_gatherdiv4df;
12110 goto gather_gen;
12111 case IX86_BUILTIN_GATHERSIV4SF:
12112 icode = CODE_FOR_avx2_gathersiv4sf;
12113 goto gather_gen;
12114 case IX86_BUILTIN_GATHERSIV8SF:
12115 icode = CODE_FOR_avx2_gathersiv8sf;
12116 goto gather_gen;
12117 case IX86_BUILTIN_GATHERDIV4SF:
12118 icode = CODE_FOR_avx2_gatherdiv4sf;
12119 goto gather_gen;
12120 case IX86_BUILTIN_GATHERDIV8SF:
12121 icode = CODE_FOR_avx2_gatherdiv8sf;
12122 goto gather_gen;
12123 case IX86_BUILTIN_GATHERSIV2DI:
12124 icode = CODE_FOR_avx2_gathersiv2di;
12125 goto gather_gen;
12126 case IX86_BUILTIN_GATHERSIV4DI:
12127 icode = CODE_FOR_avx2_gathersiv4di;
12128 goto gather_gen;
12129 case IX86_BUILTIN_GATHERDIV2DI:
12130 icode = CODE_FOR_avx2_gatherdiv2di;
12131 goto gather_gen;
12132 case IX86_BUILTIN_GATHERDIV4DI:
12133 icode = CODE_FOR_avx2_gatherdiv4di;
12134 goto gather_gen;
12135 case IX86_BUILTIN_GATHERSIV4SI:
12136 icode = CODE_FOR_avx2_gathersiv4si;
12137 goto gather_gen;
12138 case IX86_BUILTIN_GATHERSIV8SI:
12139 icode = CODE_FOR_avx2_gathersiv8si;
12140 goto gather_gen;
12141 case IX86_BUILTIN_GATHERDIV4SI:
12142 icode = CODE_FOR_avx2_gatherdiv4si;
12143 goto gather_gen;
12144 case IX86_BUILTIN_GATHERDIV8SI:
12145 icode = CODE_FOR_avx2_gatherdiv8si;
12146 goto gather_gen;
12147 case IX86_BUILTIN_GATHERALTSIV4DF:
12148 icode = CODE_FOR_avx2_gathersiv4df;
12149 goto gather_gen;
12150 case IX86_BUILTIN_GATHERALTDIV8SF:
12151 icode = CODE_FOR_avx2_gatherdiv8sf;
12152 goto gather_gen;
12153 case IX86_BUILTIN_GATHERALTSIV4DI:
12154 icode = CODE_FOR_avx2_gathersiv4di;
12155 goto gather_gen;
12156 case IX86_BUILTIN_GATHERALTDIV8SI:
12157 icode = CODE_FOR_avx2_gatherdiv8si;
12158 goto gather_gen;
12159 case IX86_BUILTIN_GATHER3SIV16SF:
12160 icode = CODE_FOR_avx512f_gathersiv16sf;
12161 goto gather_gen;
12162 case IX86_BUILTIN_GATHER3SIV8DF:
12163 icode = CODE_FOR_avx512f_gathersiv8df;
12164 goto gather_gen;
12165 case IX86_BUILTIN_GATHER3DIV16SF:
12166 icode = CODE_FOR_avx512f_gatherdiv16sf;
12167 goto gather_gen;
12168 case IX86_BUILTIN_GATHER3DIV8DF:
12169 icode = CODE_FOR_avx512f_gatherdiv8df;
12170 goto gather_gen;
12171 case IX86_BUILTIN_GATHER3SIV16SI:
12172 icode = CODE_FOR_avx512f_gathersiv16si;
12173 goto gather_gen;
12174 case IX86_BUILTIN_GATHER3SIV8DI:
12175 icode = CODE_FOR_avx512f_gathersiv8di;
12176 goto gather_gen;
12177 case IX86_BUILTIN_GATHER3DIV16SI:
12178 icode = CODE_FOR_avx512f_gatherdiv16si;
12179 goto gather_gen;
12180 case IX86_BUILTIN_GATHER3DIV8DI:
12181 icode = CODE_FOR_avx512f_gatherdiv8di;
12182 goto gather_gen;
12183 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12184 icode = CODE_FOR_avx512f_gathersiv8df;
12185 goto gather_gen;
12186 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12187 icode = CODE_FOR_avx512f_gatherdiv16sf;
12188 goto gather_gen;
12189 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12190 icode = CODE_FOR_avx512f_gathersiv8di;
12191 goto gather_gen;
12192 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12193 icode = CODE_FOR_avx512f_gatherdiv16si;
12194 goto gather_gen;
12195 case IX86_BUILTIN_GATHER3SIV2DF:
12196 icode = CODE_FOR_avx512vl_gathersiv2df;
12197 goto gather_gen;
12198 case IX86_BUILTIN_GATHER3SIV4DF:
12199 icode = CODE_FOR_avx512vl_gathersiv4df;
12200 goto gather_gen;
12201 case IX86_BUILTIN_GATHER3DIV2DF:
12202 icode = CODE_FOR_avx512vl_gatherdiv2df;
12203 goto gather_gen;
12204 case IX86_BUILTIN_GATHER3DIV4DF:
12205 icode = CODE_FOR_avx512vl_gatherdiv4df;
12206 goto gather_gen;
12207 case IX86_BUILTIN_GATHER3SIV4SF:
12208 icode = CODE_FOR_avx512vl_gathersiv4sf;
12209 goto gather_gen;
12210 case IX86_BUILTIN_GATHER3SIV8SF:
12211 icode = CODE_FOR_avx512vl_gathersiv8sf;
12212 goto gather_gen;
12213 case IX86_BUILTIN_GATHER3DIV4SF:
12214 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12215 goto gather_gen;
12216 case IX86_BUILTIN_GATHER3DIV8SF:
12217 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12218 goto gather_gen;
12219 case IX86_BUILTIN_GATHER3SIV2DI:
12220 icode = CODE_FOR_avx512vl_gathersiv2di;
12221 goto gather_gen;
12222 case IX86_BUILTIN_GATHER3SIV4DI:
12223 icode = CODE_FOR_avx512vl_gathersiv4di;
12224 goto gather_gen;
12225 case IX86_BUILTIN_GATHER3DIV2DI:
12226 icode = CODE_FOR_avx512vl_gatherdiv2di;
12227 goto gather_gen;
12228 case IX86_BUILTIN_GATHER3DIV4DI:
12229 icode = CODE_FOR_avx512vl_gatherdiv4di;
12230 goto gather_gen;
12231 case IX86_BUILTIN_GATHER3SIV4SI:
12232 icode = CODE_FOR_avx512vl_gathersiv4si;
12233 goto gather_gen;
12234 case IX86_BUILTIN_GATHER3SIV8SI:
12235 icode = CODE_FOR_avx512vl_gathersiv8si;
12236 goto gather_gen;
12237 case IX86_BUILTIN_GATHER3DIV4SI:
12238 icode = CODE_FOR_avx512vl_gatherdiv4si;
12239 goto gather_gen;
12240 case IX86_BUILTIN_GATHER3DIV8SI:
12241 icode = CODE_FOR_avx512vl_gatherdiv8si;
12242 goto gather_gen;
12243 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12244 icode = CODE_FOR_avx512vl_gathersiv4df;
12245 goto gather_gen;
12246 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12247 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12248 goto gather_gen;
12249 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12250 icode = CODE_FOR_avx512vl_gathersiv4di;
12251 goto gather_gen;
12252 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12253 icode = CODE_FOR_avx512vl_gatherdiv8si;
12254 goto gather_gen;
12255 case IX86_BUILTIN_SCATTERSIV16SF:
12256 icode = CODE_FOR_avx512f_scattersiv16sf;
12257 goto scatter_gen;
12258 case IX86_BUILTIN_SCATTERSIV8DF:
12259 icode = CODE_FOR_avx512f_scattersiv8df;
12260 goto scatter_gen;
12261 case IX86_BUILTIN_SCATTERDIV16SF:
12262 icode = CODE_FOR_avx512f_scatterdiv16sf;
12263 goto scatter_gen;
12264 case IX86_BUILTIN_SCATTERDIV8DF:
12265 icode = CODE_FOR_avx512f_scatterdiv8df;
12266 goto scatter_gen;
12267 case IX86_BUILTIN_SCATTERSIV16SI:
12268 icode = CODE_FOR_avx512f_scattersiv16si;
12269 goto scatter_gen;
12270 case IX86_BUILTIN_SCATTERSIV8DI:
12271 icode = CODE_FOR_avx512f_scattersiv8di;
12272 goto scatter_gen;
12273 case IX86_BUILTIN_SCATTERDIV16SI:
12274 icode = CODE_FOR_avx512f_scatterdiv16si;
12275 goto scatter_gen;
12276 case IX86_BUILTIN_SCATTERDIV8DI:
12277 icode = CODE_FOR_avx512f_scatterdiv8di;
12278 goto scatter_gen;
12279 case IX86_BUILTIN_SCATTERSIV8SF:
12280 icode = CODE_FOR_avx512vl_scattersiv8sf;
12281 goto scatter_gen;
12282 case IX86_BUILTIN_SCATTERSIV4SF:
12283 icode = CODE_FOR_avx512vl_scattersiv4sf;
12284 goto scatter_gen;
12285 case IX86_BUILTIN_SCATTERSIV4DF:
12286 icode = CODE_FOR_avx512vl_scattersiv4df;
12287 goto scatter_gen;
12288 case IX86_BUILTIN_SCATTERSIV2DF:
12289 icode = CODE_FOR_avx512vl_scattersiv2df;
12290 goto scatter_gen;
12291 case IX86_BUILTIN_SCATTERDIV8SF:
12292 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12293 goto scatter_gen;
12294 case IX86_BUILTIN_SCATTERDIV4SF:
12295 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12296 goto scatter_gen;
12297 case IX86_BUILTIN_SCATTERDIV4DF:
12298 icode = CODE_FOR_avx512vl_scatterdiv4df;
12299 goto scatter_gen;
12300 case IX86_BUILTIN_SCATTERDIV2DF:
12301 icode = CODE_FOR_avx512vl_scatterdiv2df;
12302 goto scatter_gen;
12303 case IX86_BUILTIN_SCATTERSIV8SI:
12304 icode = CODE_FOR_avx512vl_scattersiv8si;
12305 goto scatter_gen;
12306 case IX86_BUILTIN_SCATTERSIV4SI:
12307 icode = CODE_FOR_avx512vl_scattersiv4si;
12308 goto scatter_gen;
12309 case IX86_BUILTIN_SCATTERSIV4DI:
12310 icode = CODE_FOR_avx512vl_scattersiv4di;
12311 goto scatter_gen;
12312 case IX86_BUILTIN_SCATTERSIV2DI:
12313 icode = CODE_FOR_avx512vl_scattersiv2di;
12314 goto scatter_gen;
12315 case IX86_BUILTIN_SCATTERDIV8SI:
12316 icode = CODE_FOR_avx512vl_scatterdiv8si;
12317 goto scatter_gen;
12318 case IX86_BUILTIN_SCATTERDIV4SI:
12319 icode = CODE_FOR_avx512vl_scatterdiv4si;
12320 goto scatter_gen;
12321 case IX86_BUILTIN_SCATTERDIV4DI:
12322 icode = CODE_FOR_avx512vl_scatterdiv4di;
12323 goto scatter_gen;
12324 case IX86_BUILTIN_SCATTERDIV2DI:
12325 icode = CODE_FOR_avx512vl_scatterdiv2di;
12326 goto scatter_gen;
12327 case IX86_BUILTIN_GATHERPFDPD:
12328 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12329 goto vec_prefetch_gen;
12330 case IX86_BUILTIN_SCATTERALTSIV8DF:
12331 icode = CODE_FOR_avx512f_scattersiv8df;
12332 goto scatter_gen;
12333 case IX86_BUILTIN_SCATTERALTDIV16SF:
12334 icode = CODE_FOR_avx512f_scatterdiv16sf;
12335 goto scatter_gen;
12336 case IX86_BUILTIN_SCATTERALTSIV8DI:
12337 icode = CODE_FOR_avx512f_scattersiv8di;
12338 goto scatter_gen;
12339 case IX86_BUILTIN_SCATTERALTDIV16SI:
12340 icode = CODE_FOR_avx512f_scatterdiv16si;
12341 goto scatter_gen;
12342 case IX86_BUILTIN_SCATTERALTSIV4DF:
12343 icode = CODE_FOR_avx512vl_scattersiv4df;
12344 goto scatter_gen;
12345 case IX86_BUILTIN_SCATTERALTDIV8SF:
12346 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12347 goto scatter_gen;
12348 case IX86_BUILTIN_SCATTERALTSIV4DI:
12349 icode = CODE_FOR_avx512vl_scattersiv4di;
12350 goto scatter_gen;
12351 case IX86_BUILTIN_SCATTERALTDIV8SI:
12352 icode = CODE_FOR_avx512vl_scatterdiv8si;
12353 goto scatter_gen;
12354 case IX86_BUILTIN_SCATTERALTSIV2DF:
12355 icode = CODE_FOR_avx512vl_scattersiv2df;
12356 goto scatter_gen;
12357 case IX86_BUILTIN_SCATTERALTDIV4SF:
12358 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12359 goto scatter_gen;
12360 case IX86_BUILTIN_SCATTERALTSIV2DI:
12361 icode = CODE_FOR_avx512vl_scattersiv2di;
12362 goto scatter_gen;
12363 case IX86_BUILTIN_SCATTERALTDIV4SI:
12364 icode = CODE_FOR_avx512vl_scatterdiv4si;
12365 goto scatter_gen;
12366 case IX86_BUILTIN_GATHERPFDPS:
12367 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12368 goto vec_prefetch_gen;
12369 case IX86_BUILTIN_GATHERPFQPD:
12370 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12371 goto vec_prefetch_gen;
12372 case IX86_BUILTIN_GATHERPFQPS:
12373 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12374 goto vec_prefetch_gen;
12375 case IX86_BUILTIN_SCATTERPFDPD:
12376 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12377 goto vec_prefetch_gen;
12378 case IX86_BUILTIN_SCATTERPFDPS:
12379 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12380 goto vec_prefetch_gen;
12381 case IX86_BUILTIN_SCATTERPFQPD:
12382 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12383 goto vec_prefetch_gen;
12384 case IX86_BUILTIN_SCATTERPFQPS:
12385 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12386 goto vec_prefetch_gen;
12387
12388 gather_gen:
12389 rtx half;
12390 rtx (*gen) (rtx, rtx);
12391
12392 arg0 = CALL_EXPR_ARG (exp, 0);
12393 arg1 = CALL_EXPR_ARG (exp, 1);
12394 arg2 = CALL_EXPR_ARG (exp, 2);
12395 arg3 = CALL_EXPR_ARG (exp, 3);
12396 arg4 = CALL_EXPR_ARG (exp, 4);
12397 op0 = expand_normal (arg0);
12398 op1 = expand_normal (arg1);
12399 op2 = expand_normal (arg2);
12400 op3 = expand_normal (arg3);
12401 op4 = expand_normal (arg4);
12402 /* Note the arg order is different from the operand order. */
12403 mode0 = insn_data[icode].operand[1].mode;
12404 mode2 = insn_data[icode].operand[3].mode;
12405 mode3 = insn_data[icode].operand[4].mode;
12406 mode4 = insn_data[icode].operand[5].mode;
12407
12408 if (target == NULL_RTX
12409 || GET_MODE (target) != insn_data[icode].operand[0].mode
12410 || !insn_data[icode].operand[0].predicate (target,
12411 GET_MODE (target)))
12412 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12413 else
12414 subtarget = target;
12415
12416 switch (fcode)
12417 {
12418 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12419 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12420 half = gen_reg_rtx (V8SImode);
12421 if (!nonimmediate_operand (op2, V16SImode))
12422 op2 = copy_to_mode_reg (V16SImode, op2);
12423 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12424 op2 = half;
12425 break;
12426 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12427 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12428 case IX86_BUILTIN_GATHERALTSIV4DF:
12429 case IX86_BUILTIN_GATHERALTSIV4DI:
12430 half = gen_reg_rtx (V4SImode);
12431 if (!nonimmediate_operand (op2, V8SImode))
12432 op2 = copy_to_mode_reg (V8SImode, op2);
12433 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12434 op2 = half;
12435 break;
12436 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12437 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12438 half = gen_reg_rtx (mode0);
12439 if (mode0 == V8SFmode)
12440 gen = gen_vec_extract_lo_v16sf;
12441 else
12442 gen = gen_vec_extract_lo_v16si;
12443 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12444 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12445 emit_insn (gen (half, op0));
12446 op0 = half;
12447 op3 = lowpart_subreg (QImode, op3, HImode);
12448 break;
12449 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12450 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12451 case IX86_BUILTIN_GATHERALTDIV8SF:
12452 case IX86_BUILTIN_GATHERALTDIV8SI:
12453 half = gen_reg_rtx (mode0);
12454 if (mode0 == V4SFmode)
12455 gen = gen_vec_extract_lo_v8sf;
12456 else
12457 gen = gen_vec_extract_lo_v8si;
12458 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12459 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12460 emit_insn (gen (half, op0));
12461 op0 = half;
12462 if (VECTOR_MODE_P (GET_MODE (op3)))
12463 {
12464 half = gen_reg_rtx (mode0);
12465 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12466 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12467 emit_insn (gen (half, op3));
12468 op3 = half;
12469 }
12470 break;
12471 default:
12472 break;
12473 }
12474
12475 /* Force the memory operand to use only a base register here; we
12476 don't want to do this for the memory operands of other builtin
12477 functions. */
12478 op1 = ix86_zero_extend_to_Pmode (op1);
12479
12480 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12481 op0 = copy_to_mode_reg (mode0, op0);
12482 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12483 op1 = copy_to_mode_reg (Pmode, op1);
12484 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12485 op2 = copy_to_mode_reg (mode2, op2);
12486
12487 op3 = fixup_modeless_constant (op3, mode3);
12488
12489 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12490 {
12491 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12492 op3 = copy_to_mode_reg (mode3, op3);
12493 }
12494 else
12495 {
12496 op3 = copy_to_reg (op3);
12497 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12498 }
12499 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12500 {
12501 error ("the last argument must be scale 1, 2, 4, 8");
12502 return const0_rtx;
12503 }
12504
12505 /* Optimize. If mask is known to have all high bits set,
12506 replace op0 with pc_rtx to signal that the instruction
12507 overwrites the whole destination and doesn't use its
12508 previous contents. */
12509 if (optimize)
12510 {
12511 if (TREE_CODE (arg3) == INTEGER_CST)
12512 {
12513 if (integer_all_onesp (arg3))
12514 op0 = pc_rtx;
12515 }
12516 else if (TREE_CODE (arg3) == VECTOR_CST)
12517 {
12518 unsigned int negative = 0;
12519 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12520 {
12521 tree cst = VECTOR_CST_ELT (arg3, i);
12522 if (TREE_CODE (cst) == INTEGER_CST
12523 && tree_int_cst_sign_bit (cst))
12524 negative++;
12525 else if (TREE_CODE (cst) == REAL_CST
12526 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12527 negative++;
12528 }
12529 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12530 op0 = pc_rtx;
12531 }
12532 else if (TREE_CODE (arg3) == SSA_NAME
12533 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12534 {
12535 /* Recognize also when mask is like:
12536 __v2df src = _mm_setzero_pd ();
12537 __v2df mask = _mm_cmpeq_pd (src, src);
12538 or
12539 __v8sf src = _mm256_setzero_ps ();
12540 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12541 as that is a cheaper way to load all ones into
12542 a register than having to load a constant from
12543 memory. */
12544 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12545 if (is_gimple_call (def_stmt))
12546 {
12547 tree fndecl = gimple_call_fndecl (def_stmt);
12548 if (fndecl
12549 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12550 switch (DECL_MD_FUNCTION_CODE (fndecl))
12551 {
12552 case IX86_BUILTIN_CMPPD:
12553 case IX86_BUILTIN_CMPPS:
12554 case IX86_BUILTIN_CMPPD256:
12555 case IX86_BUILTIN_CMPPS256:
12556 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12557 break;
12558 /* FALLTHRU */
12559 case IX86_BUILTIN_CMPEQPD:
12560 case IX86_BUILTIN_CMPEQPS:
12561 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12562 && initializer_zerop (gimple_call_arg (def_stmt,
12563 1)))
12564 op0 = pc_rtx;
12565 break;
12566 default:
12567 break;
12568 }
12569 }
12570 }
12571 }
12572
12573 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12574 if (! pat)
12575 return const0_rtx;
12576 emit_insn (pat);
12577
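/* For the DIV-indexed gathers of 32-bit elements handled below, the
   index vector has twice as many elements as are actually gathered,
   so only the low half of the pattern's destination carries data;
   extract that half to form the user-visible result (e.g. a
   gatherdiv8sf yields a V4SF value from a V8SF subtarget).  */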
12578 switch (fcode)
12579 {
12580 case IX86_BUILTIN_GATHER3DIV16SF:
12581 if (target == NULL_RTX)
12582 target = gen_reg_rtx (V8SFmode);
12583 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12584 break;
12585 case IX86_BUILTIN_GATHER3DIV16SI:
12586 if (target == NULL_RTX)
12587 target = gen_reg_rtx (V8SImode);
12588 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12589 break;
12590 case IX86_BUILTIN_GATHER3DIV8SF:
12591 case IX86_BUILTIN_GATHERDIV8SF:
12592 if (target == NULL_RTX)
12593 target = gen_reg_rtx (V4SFmode);
12594 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12595 break;
12596 case IX86_BUILTIN_GATHER3DIV8SI:
12597 case IX86_BUILTIN_GATHERDIV8SI:
12598 if (target == NULL_RTX)
12599 target = gen_reg_rtx (V4SImode);
12600 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12601 break;
12602 default:
12603 target = subtarget;
12604 break;
12605 }
12606 return target;
12607
12608 scatter_gen:
12609 arg0 = CALL_EXPR_ARG (exp, 0);
12610 arg1 = CALL_EXPR_ARG (exp, 1);
12611 arg2 = CALL_EXPR_ARG (exp, 2);
12612 arg3 = CALL_EXPR_ARG (exp, 3);
12613 arg4 = CALL_EXPR_ARG (exp, 4);
12614 op0 = expand_normal (arg0);
12615 op1 = expand_normal (arg1);
12616 op2 = expand_normal (arg2);
12617 op3 = expand_normal (arg3);
12618 op4 = expand_normal (arg4);
12619 mode1 = insn_data[icode].operand[1].mode;
12620 mode2 = insn_data[icode].operand[2].mode;
12621 mode3 = insn_data[icode].operand[3].mode;
12622 mode4 = insn_data[icode].operand[4].mode;
12623
12624 /* The scatter instruction stores operand op3 to memory, with
12625 indices from op2 and scale from op4, under writemask op1.
12626 If index operand op2 has more elements than source operand
12627 op3, only its low half is used, and vice versa. */
12628 switch (fcode)
12629 {
12630 case IX86_BUILTIN_SCATTERALTSIV8DF:
12631 case IX86_BUILTIN_SCATTERALTSIV8DI:
12632 half = gen_reg_rtx (V8SImode);
12633 if (!nonimmediate_operand (op2, V16SImode))
12634 op2 = copy_to_mode_reg (V16SImode, op2);
12635 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12636 op2 = half;
12637 break;
12638 case IX86_BUILTIN_SCATTERALTDIV16SF:
12639 case IX86_BUILTIN_SCATTERALTDIV16SI:
12640 half = gen_reg_rtx (mode3);
12641 if (mode3 == V8SFmode)
12642 gen = gen_vec_extract_lo_v16sf;
12643 else
12644 gen = gen_vec_extract_lo_v16si;
12645 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12646 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12647 emit_insn (gen (half, op3));
12648 op3 = half;
12649 break;
12650 case IX86_BUILTIN_SCATTERALTSIV4DF:
12651 case IX86_BUILTIN_SCATTERALTSIV4DI:
12652 half = gen_reg_rtx (V4SImode);
12653 if (!nonimmediate_operand (op2, V8SImode))
12654 op2 = copy_to_mode_reg (V8SImode, op2);
12655 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12656 op2 = half;
12657 break;
12658 case IX86_BUILTIN_SCATTERALTDIV8SF:
12659 case IX86_BUILTIN_SCATTERALTDIV8SI:
12660 half = gen_reg_rtx (mode3);
12661 if (mode3 == V4SFmode)
12662 gen = gen_vec_extract_lo_v8sf;
12663 else
12664 gen = gen_vec_extract_lo_v8si;
12665 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12666 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12667 emit_insn (gen (half, op3));
12668 op3 = half;
12669 break;
12670 case IX86_BUILTIN_SCATTERALTSIV2DF:
12671 case IX86_BUILTIN_SCATTERALTSIV2DI:
12672 if (!nonimmediate_operand (op2, V4SImode))
12673 op2 = copy_to_mode_reg (V4SImode, op2);
12674 break;
12675 case IX86_BUILTIN_SCATTERALTDIV4SF:
12676 case IX86_BUILTIN_SCATTERALTDIV4SI:
12677 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12678 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12679 break;
12680 default:
12681 break;
12682 }
12683
12684 /* Force the memory operand to use only a base register here; we
12685 don't want to do this for the memory operands of other builtin
12686 functions. */
12687 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12688
12689 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12690 op0 = copy_to_mode_reg (Pmode, op0);
12691
12692 op1 = fixup_modeless_constant (op1, mode1);
12693
12694 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12695 {
12696 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12697 op1 = copy_to_mode_reg (mode1, op1);
12698 }
12699 else
12700 {
12701 op1 = copy_to_reg (op1);
12702 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12703 }
12704
12705 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12706 op2 = copy_to_mode_reg (mode2, op2);
12707
12708 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12709 op3 = copy_to_mode_reg (mode3, op3);
12710
12711 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12712 {
12713 error ("the last argument must be scale 1, 2, 4, 8");
12714 return const0_rtx;
12715 }
12716
12717 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12718 if (! pat)
12719 return const0_rtx;
12720
12721 emit_insn (pat);
12722 return 0;
12723
12724 vec_prefetch_gen:
12725 arg0 = CALL_EXPR_ARG (exp, 0);
12726 arg1 = CALL_EXPR_ARG (exp, 1);
12727 arg2 = CALL_EXPR_ARG (exp, 2);
12728 arg3 = CALL_EXPR_ARG (exp, 3);
12729 arg4 = CALL_EXPR_ARG (exp, 4);
12730 op0 = expand_normal (arg0);
12731 op1 = expand_normal (arg1);
12732 op2 = expand_normal (arg2);
12733 op3 = expand_normal (arg3);
12734 op4 = expand_normal (arg4);
12735 mode0 = insn_data[icode].operand[0].mode;
12736 mode1 = insn_data[icode].operand[1].mode;
12737 mode3 = insn_data[icode].operand[3].mode;
12738 mode4 = insn_data[icode].operand[4].mode;
12739
12740 op0 = fixup_modeless_constant (op0, mode0);
12741
12742 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12743 {
12744 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12745 op0 = copy_to_mode_reg (mode0, op0);
12746 }
12747 else
12748 {
12749 op0 = copy_to_reg (op0);
12750 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12751 }
12752
12753 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12754 op1 = copy_to_mode_reg (mode1, op1);
12755
12756 /* Force the memory operand to use only a base register here; we
12757 don't want to do this for the memory operands of other builtin
12758 functions. */
12759 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12760
12761 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12762 op2 = copy_to_mode_reg (Pmode, op2);
12763
12764 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12765 {
12766 error ("the forth argument must be scale 1, 2, 4, 8");
12767 return const0_rtx;
12768 }
12769
12770 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12771 {
12772 error ("incorrect hint operand");
12773 return const0_rtx;
12774 }
12775
12776 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12777 if (! pat)
12778 return const0_rtx;
12779
12780 emit_insn (pat);
12781
12782 return 0;
12783
12784 case IX86_BUILTIN_XABORT:
12785 icode = CODE_FOR_xabort;
12786 arg0 = CALL_EXPR_ARG (exp, 0);
12787 op0 = expand_normal (arg0);
12788 mode0 = insn_data[icode].operand[0].mode;
12789 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12790 {
12791 error ("the argument to %<xabort%> intrinsic must "
12792 "be an 8-bit immediate");
12793 return const0_rtx;
12794 }
12795 emit_insn (gen_xabort (op0));
12796 return 0;
12797
12798 case IX86_BUILTIN_RSTORSSP:
12799 case IX86_BUILTIN_CLRSSBSY:
12800 arg0 = CALL_EXPR_ARG (exp, 0);
12801 op0 = expand_normal (arg0);
12802 icode = (fcode == IX86_BUILTIN_RSTORSSP
12803 ? CODE_FOR_rstorssp
12804 : CODE_FOR_clrssbsy);
12805 if (!address_operand (op0, VOIDmode))
12806 {
12807 op1 = convert_memory_address (Pmode, op0);
12808 op0 = copy_addr_to_reg (op1);
12809 }
12810 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12811 return 0;
12812
12813 case IX86_BUILTIN_WRSSD:
12814 case IX86_BUILTIN_WRSSQ:
12815 case IX86_BUILTIN_WRUSSD:
12816 case IX86_BUILTIN_WRUSSQ:
12817 arg0 = CALL_EXPR_ARG (exp, 0);
12818 op0 = expand_normal (arg0);
12819 arg1 = CALL_EXPR_ARG (exp, 1);
12820 op1 = expand_normal (arg1);
12821 switch (fcode)
12822 {
12823 case IX86_BUILTIN_WRSSD:
12824 icode = CODE_FOR_wrsssi;
12825 mode = SImode;
12826 break;
12827 case IX86_BUILTIN_WRSSQ:
12828 icode = CODE_FOR_wrssdi;
12829 mode = DImode;
12830 break;
12831 case IX86_BUILTIN_WRUSSD:
12832 icode = CODE_FOR_wrusssi;
12833 mode = SImode;
12834 break;
12835 case IX86_BUILTIN_WRUSSQ:
12836 icode = CODE_FOR_wrussdi;
12837 mode = DImode;
12838 break;
12839 }
12840 op0 = force_reg (mode, op0);
12841 if (!address_operand (op1, VOIDmode))
12842 {
12843 op2 = convert_memory_address (Pmode, op1);
12844 op1 = copy_addr_to_reg (op2);
12845 }
12846 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12847 return 0;
12848
12849 default:
12850 break;
12851 }
12852
12853 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12854 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12855 {
12856 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12857 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12858 target);
12859 }
12860
12861 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12862 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12863 {
12864 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12865 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12866 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12867 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12868 int masked = 1;
12869 machine_mode mode, wide_mode, nar_mode;
12870
12871 nar_mode = V4SFmode;
12872 mode = V16SFmode;
12873 wide_mode = V64SFmode;
12874 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12875 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12876
12877 switch (fcode)
12878 {
12879 case IX86_BUILTIN_4FMAPS:
12880 fcn = gen_avx5124fmaddps_4fmaddps;
12881 masked = 0;
12882 goto v4fma_expand;
12883
12884 case IX86_BUILTIN_4DPWSSD:
12885 nar_mode = V4SImode;
12886 mode = V16SImode;
12887 wide_mode = V64SImode;
12888 fcn = gen_avx5124vnniw_vp4dpwssd;
12889 masked = 0;
12890 goto v4fma_expand;
12891
12892 case IX86_BUILTIN_4DPWSSDS:
12893 nar_mode = V4SImode;
12894 mode = V16SImode;
12895 wide_mode = V64SImode;
12896 fcn = gen_avx5124vnniw_vp4dpwssds;
12897 masked = 0;
12898 goto v4fma_expand;
12899
12900 case IX86_BUILTIN_4FNMAPS:
12901 fcn = gen_avx5124fmaddps_4fnmaddps;
12902 masked = 0;
12903 goto v4fma_expand;
12904
12905 case IX86_BUILTIN_4FNMAPS_MASK:
12906 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12907 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12908 goto v4fma_expand;
12909
12910 case IX86_BUILTIN_4DPWSSD_MASK:
12911 nar_mode = V4SImode;
12912 mode = V16SImode;
12913 wide_mode = V64SImode;
12914 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12915 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12916 goto v4fma_expand;
12917
12918 case IX86_BUILTIN_4DPWSSDS_MASK:
12919 nar_mode = V4SImode;
12920 mode = V16SImode;
12921 wide_mode = V64SImode;
12922 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12923 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12924 goto v4fma_expand;
12925
12926 case IX86_BUILTIN_4FMAPS_MASK:
12927 {
12928 tree args[4];
12929 rtx ops[4];
12930 rtx wide_reg;
12931 rtx accum;
12932 rtx addr;
12933 rtx mem;
12934
12935 v4fma_expand:
12936 wide_reg = gen_reg_rtx (wide_mode);
12937 for (i = 0; i < 4; i++)
12938 {
12939 args[i] = CALL_EXPR_ARG (exp, i);
12940 ops[i] = expand_normal (args[i]);
12941
12942 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12943 ops[i]);
12944 }
12945
12946 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12947 accum = force_reg (mode, accum);
12948
12949 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12950 addr = force_reg (Pmode, addr);
12951
12952 mem = gen_rtx_MEM (nar_mode, addr);
12953
12954 target = gen_reg_rtx (mode);
12955
12956 emit_move_insn (target, accum);
12957
12958 if (! masked)
12959 emit_insn (fcn (target, accum, wide_reg, mem));
12960 else
12961 {
12962 rtx merge, mask;
12963 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12964
12965 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12966
12967 if (CONST_INT_P (mask))
12968 mask = fixup_modeless_constant (mask, HImode);
12969
12970 mask = force_reg (HImode, mask);
12971
12972 if (GET_MODE (mask) != HImode)
12973 mask = gen_rtx_SUBREG (HImode, mask, 0);
12974
12975 /* If merge is 0 then we're about to emit z-masked variant. */
12976 if (const0_operand (merge, mode))
12977 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12978 /* If merge is the same as accum then emit merge-masked variant. */
12979 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12980 {
12981 merge = force_reg (mode, merge);
12982 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12983 }
12984 /* Merging with something unknown can happen if we z-mask at -O0. */
12985 else
12986 {
12987 target = gen_reg_rtx (mode);
12988 emit_move_insn (target, merge);
12989 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12990 }
12991 }
12992 return target;
12993 }
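/* The 4FMAPS/4VNNIW instructions read a block of four consecutive
   vector registers; the expansion models that block as one wide
   V64SF/V64SI pseudo and fills its four 512-bit chunks from the first
   four call arguments via the SUBREG moves above.  */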
12994
12995 case IX86_BUILTIN_4FNMASS:
12996 fcn = gen_avx5124fmaddps_4fnmaddss;
12997 masked = 0;
12998 goto s4fma_expand;
12999
13000 case IX86_BUILTIN_4FMASS:
13001 fcn = gen_avx5124fmaddps_4fmaddss;
13002 masked = 0;
13003 goto s4fma_expand;
13004
13005 case IX86_BUILTIN_4FNMASS_MASK:
13006 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13007 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13008 goto s4fma_expand;
13009
13010 case IX86_BUILTIN_4FMASS_MASK:
13011 {
13012 tree args[4];
13013 rtx ops[4];
13014 rtx wide_reg;
13015 rtx accum;
13016 rtx addr;
13017 rtx mem;
13018
13019 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13020 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13021
13022 s4fma_expand:
13023 mode = V4SFmode;
13024 wide_reg = gen_reg_rtx (V64SFmode);
13025 for (i = 0; i < 4; i++)
13026 {
13027 rtx tmp;
13028 args[i] = CALL_EXPR_ARG (exp, i);
13029 ops[i] = expand_normal (args[i]);
13030
13031 tmp = gen_reg_rtx (SFmode);
13032 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13033
13034 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13035 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13036 }
13037
13038 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13039 accum = force_reg (V4SFmode, accum);
13040
13041 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13042 addr = force_reg (Pmode, addr);
13043
13044 mem = gen_rtx_MEM (V4SFmode, addr);
13045
13046 target = gen_reg_rtx (V4SFmode);
13047
13048 emit_move_insn (target, accum);
13049
13050 if (! masked)
13051 emit_insn (fcn (target, accum, wide_reg, mem));
13052 else
13053 {
13054 rtx merge, mask;
13055 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13056
13057 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13058
13059 if (CONST_INT_P (mask))
13060 mask = fixup_modeless_constant (mask, QImode);
13061
13062 mask = force_reg (QImode, mask);
13063
13064 if (GET_MODE (mask) != QImode)
13065 mask = gen_rtx_SUBREG (QImode, mask, 0);
13066
13067 /* If merge is 0 then we're about to emit z-masked variant. */
13068 if (const0_operand (merge, mode))
13069 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13070 /* If merge is the same as accum then emit merge-masked
13071 variant. */
13072 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13073 {
13074 merge = force_reg (mode, merge);
13075 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13076 }
13077 /* Merging with something unknown can happen if we z-mask
13078 at -O0. */
13079 else
13080 {
13081 target = gen_reg_rtx (mode);
13082 emit_move_insn (target, merge);
13083 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13084 }
13085 }
13086 return target;
13087 }
13088 case IX86_BUILTIN_RDPID:
13089 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13090 target);
13091 case IX86_BUILTIN_FABSQ:
13092 case IX86_BUILTIN_COPYSIGNQ:
13093 if (!TARGET_SSE)
13094 /* Emit a normal call if SSE isn't available. */
13095 return expand_call (exp, target, ignore);
13096 /* FALLTHRU */
13097 default:
13098 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13099 }
13100 }
13101
13102 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13103 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13104 {
13105 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13106 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13107 }
13108
13109 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13110 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13111 {
13112 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13113 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13114 }
13115
13116 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13117 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13118 {
13119 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13120 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13121 }
13122
13123 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13124 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13125 {
13126 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13127 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13128 }
13129
13130 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13131 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13132 {
13133 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13134 const struct builtin_description *d = bdesc_multi_arg + i;
13135 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13136 (enum ix86_builtin_func_type)
13137 d->flag, d->comparison);
13138 }
13139
13140 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13141 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13142 {
13143 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13144 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13145 target);
13146 }
13147
13148 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13149 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13150 {
13151 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13152 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13153 target);
13154 }
13155
13156 gcc_unreachable ();
13157 }
13158
13159 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13160 fill target with val via vec_duplicate. */
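/* For example, a V4SFmode broadcast is emitted as

(set (reg:V4SF target)
(vec_duplicate:V4SF (reg:SF val)))

and is kept only if the backend recognizes the resulting insn.  */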
13161
13162 static bool
13163 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13164 {
13165 bool ok;
13166 rtx_insn *insn;
13167 rtx dup;
13168
13169 /* First attempt to recognize VAL as-is. */
13170 dup = gen_vec_duplicate (mode, val);
13171 insn = emit_insn (gen_rtx_SET (target, dup));
13172 if (recog_memoized (insn) < 0)
13173 {
13174 rtx_insn *seq;
13175 machine_mode innermode = GET_MODE_INNER (mode);
13176 rtx reg;
13177
13178 /* If that fails, force VAL into a register. */
13179
13180 start_sequence ();
13181 reg = force_reg (innermode, val);
13182 if (GET_MODE (reg) != innermode)
13183 reg = gen_lowpart (innermode, reg);
13184 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13185 seq = get_insns ();
13186 end_sequence ();
13187 if (seq)
13188 emit_insn_before (seq, insn);
13189
13190 ok = recog_memoized (insn) >= 0;
13191 gcc_assert (ok);
13192 }
13193 return true;
13194 }
13195
13196 /* Get a vector mode of the same size as the original but with elements
13197 twice as wide. This is only guaranteed to apply to integral vectors. */
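/* E.g. V16QImode yields V8HImode and V8HImode yields V4SImode,
both 128 bits wide.  */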
13198
13199 static machine_mode
13200 get_mode_wider_vector (machine_mode o)
13201 {
13202 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13203 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13204 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13205 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13206 return n;
13207 }
13208
13209 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13210 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13211
13212 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13213 with all elements equal to VAR. Return true if successful. */
13214
13215 static bool
13216 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13217 rtx target, rtx val)
13218 {
13219 bool ok;
13220
13221 switch (mode)
13222 {
13223 case E_V2SImode:
13224 case E_V2SFmode:
13225 if (!mmx_ok)
13226 return false;
13227 /* FALLTHRU */
13228
13229 case E_V4DFmode:
13230 case E_V4DImode:
13231 case E_V8SFmode:
13232 case E_V8SImode:
13233 case E_V2DFmode:
13234 case E_V2DImode:
13235 case E_V4SFmode:
13236 case E_V4SImode:
13237 case E_V16SImode:
13238 case E_V8DImode:
13239 case E_V16SFmode:
13240 case E_V8DFmode:
13241 return ix86_vector_duplicate_value (mode, target, val);
13242
13243 case E_V4HImode:
13244 if (!mmx_ok)
13245 return false;
13246 if (TARGET_SSE || TARGET_3DNOW_A)
13247 {
13248 rtx x;
13249
13250 val = gen_lowpart (SImode, val);
13251 x = gen_rtx_TRUNCATE (HImode, val);
13252 x = gen_rtx_VEC_DUPLICATE (mode, x);
13253 emit_insn (gen_rtx_SET (target, x));
13254 return true;
13255 }
13256 goto widen;
13257
13258 case E_V8QImode:
13259 if (!mmx_ok)
13260 return false;
13261 goto widen;
13262
13263 case E_V8HImode:
13264 if (TARGET_AVX2)
13265 return ix86_vector_duplicate_value (mode, target, val);
13266
13267 if (TARGET_SSE2)
13268 {
13269 struct expand_vec_perm_d dperm;
13270 rtx tmp1, tmp2;
13271
13272 permute:
13273 memset (&dperm, 0, sizeof (dperm));
13274 dperm.target = target;
13275 dperm.vmode = mode;
13276 dperm.nelt = GET_MODE_NUNITS (mode);
13277 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13278 dperm.one_operand_p = true;
13279
13280 /* Extend to SImode using a paradoxical SUBREG. */
13281 tmp1 = gen_reg_rtx (SImode);
13282 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13283
13284 /* Insert the SImode value as low element of a V4SImode vector. */
13285 tmp2 = gen_reg_rtx (V4SImode);
13286 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13287 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13288
13289 ok = (expand_vec_perm_1 (&dperm)
13290 || expand_vec_perm_broadcast_1 (&dperm));
13291 gcc_assert (ok);
13292 return ok;
13293 }
13294 goto widen;
13295
13296 case E_V16QImode:
13297 if (TARGET_AVX2)
13298 return ix86_vector_duplicate_value (mode, target, val);
13299
13300 if (TARGET_SSE2)
13301 goto permute;
13302 goto widen;
13303
13304 widen:
13305 /* Replicate the value once into the next wider mode and recurse. */
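/* E.g. a V16QImode broadcast of V becomes a V8HImode broadcast
of (V << 8) | V.  */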
13306 {
13307 machine_mode smode, wsmode, wvmode;
13308 rtx x;
13309
13310 smode = GET_MODE_INNER (mode);
13311 wvmode = get_mode_wider_vector (mode);
13312 wsmode = GET_MODE_INNER (wvmode);
13313
13314 val = convert_modes (wsmode, smode, val, true);
13315 x = expand_simple_binop (wsmode, ASHIFT, val,
13316 GEN_INT (GET_MODE_BITSIZE (smode)),
13317 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13318 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13319
13320 x = gen_reg_rtx (wvmode);
13321 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13322 gcc_assert (ok);
13323 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13324 return ok;
13325 }
13326
13327 case E_V16HImode:
13328 case E_V32QImode:
13329 if (TARGET_AVX2)
13330 return ix86_vector_duplicate_value (mode, target, val);
13331 else
13332 {
13333 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13334 rtx x = gen_reg_rtx (hvmode);
13335
13336 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13337 gcc_assert (ok);
13338
13339 x = gen_rtx_VEC_CONCAT (mode, x, x);
13340 emit_insn (gen_rtx_SET (target, x));
13341 }
13342 return true;
13343
13344 case E_V64QImode:
13345 case E_V32HImode:
13346 if (TARGET_AVX512BW)
13347 return ix86_vector_duplicate_value (mode, target, val);
13348 else
13349 {
13350 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13351 rtx x = gen_reg_rtx (hvmode);
13352
13353 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13354 gcc_assert (ok);
13355
13356 x = gen_rtx_VEC_CONCAT (mode, x, x);
13357 emit_insn (gen_rtx_SET (target, x));
13358 }
13359 return true;
13360
13361 default:
13362 return false;
13363 }
13364 }
13365
13366 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13367 whose ONE_VAR element is VAR, and other elements are zero. Return true
13368 if successful. */
13369
13370 static bool
13371 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13372 rtx target, rtx var, int one_var)
13373 {
13374 machine_mode vsimode;
13375 rtx new_target;
13376 rtx x, tmp;
13377 bool use_vector_set = false;
13378 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13379
13380 switch (mode)
13381 {
13382 case E_V2DImode:
13383 /* For SSE4.1, we normally use vector set. But if the second
13384 element is zero and inter-unit moves are OK, we use movq
13385 instead. */
13386 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13387 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13388 && one_var == 0));
13389 break;
13390 case E_V16QImode:
13391 case E_V4SImode:
13392 case E_V4SFmode:
13393 use_vector_set = TARGET_SSE4_1;
13394 break;
13395 case E_V8HImode:
13396 use_vector_set = TARGET_SSE2;
13397 break;
13398 case E_V8QImode:
13399 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13400 break;
13401 case E_V4HImode:
13402 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13403 break;
13404 case E_V32QImode:
13405 case E_V16HImode:
13406 use_vector_set = TARGET_AVX;
13407 break;
13408 case E_V8SImode:
13409 use_vector_set = TARGET_AVX;
13410 gen_vec_set_0 = gen_vec_setv8si_0;
13411 break;
13412 case E_V8SFmode:
13413 use_vector_set = TARGET_AVX;
13414 gen_vec_set_0 = gen_vec_setv8sf_0;
13415 break;
13416 case E_V4DFmode:
13417 use_vector_set = TARGET_AVX;
13418 gen_vec_set_0 = gen_vec_setv4df_0;
13419 break;
13420 case E_V4DImode:
13421 /* Use ix86_expand_vector_set in 64bit mode only. */
13422 use_vector_set = TARGET_AVX && TARGET_64BIT;
13423 gen_vec_set_0 = gen_vec_setv4di_0;
13424 break;
13425 case E_V16SImode:
13426 use_vector_set = TARGET_AVX512F && one_var == 0;
13427 gen_vec_set_0 = gen_vec_setv16si_0;
13428 break;
13429 case E_V16SFmode:
13430 use_vector_set = TARGET_AVX512F && one_var == 0;
13431 gen_vec_set_0 = gen_vec_setv16sf_0;
13432 break;
13433 case E_V8DFmode:
13434 use_vector_set = TARGET_AVX512F && one_var == 0;
13435 gen_vec_set_0 = gen_vec_setv8df_0;
13436 break;
13437 case E_V8DImode:
13438 /* Use ix86_expand_vector_set in 64bit mode only. */
13439 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13440 gen_vec_set_0 = gen_vec_setv8di_0;
13441 break;
13442 default:
13443 break;
13444 }
13445
13446 if (use_vector_set)
13447 {
13448 if (gen_vec_set_0 && one_var == 0)
13449 {
13450 var = force_reg (GET_MODE_INNER (mode), var);
13451 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13452 return true;
13453 }
13454 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13455 var = force_reg (GET_MODE_INNER (mode), var);
13456 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13457 return true;
13458 }
13459
13460 switch (mode)
13461 {
13462 case E_V2SFmode:
13463 case E_V2SImode:
13464 if (!mmx_ok)
13465 return false;
13466 /* FALLTHRU */
13467
13468 case E_V2DFmode:
13469 case E_V2DImode:
13470 if (one_var != 0)
13471 return false;
13472 var = force_reg (GET_MODE_INNER (mode), var);
13473 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13474 emit_insn (gen_rtx_SET (target, x));
13475 return true;
13476
13477 case E_V4SFmode:
13478 case E_V4SImode:
13479 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13480 new_target = gen_reg_rtx (mode);
13481 else
13482 new_target = target;
13483 var = force_reg (GET_MODE_INNER (mode), var);
13484 x = gen_rtx_VEC_DUPLICATE (mode, var);
13485 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13486 emit_insn (gen_rtx_SET (new_target, x));
13487 if (one_var != 0)
13488 {
13489 /* We need to shuffle the value to the correct position, so
13490 create a new pseudo to store the intermediate result. */
13491
13492 /* With SSE2, we can use the integer shuffle insns. */
13493 if (mode != V4SFmode && TARGET_SSE2)
13494 {
13495 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13496 const1_rtx,
13497 GEN_INT (one_var == 1 ? 0 : 1),
13498 GEN_INT (one_var == 2 ? 0 : 1),
13499 GEN_INT (one_var == 3 ? 0 : 1)));
13500 if (target != new_target)
13501 emit_move_insn (target, new_target);
13502 return true;
13503 }
13504
13505 /* Otherwise convert the intermediate result to V4SFmode and
13506 use the SSE1 shuffle instructions. */
13507 if (mode != V4SFmode)
13508 {
13509 tmp = gen_reg_rtx (V4SFmode);
13510 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13511 }
13512 else
13513 tmp = new_target;
13514
13515 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13516 const1_rtx,
13517 GEN_INT (one_var == 1 ? 0 : 1),
13518 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13519 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13520
13521 if (mode != V4SFmode)
13522 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13523 else if (tmp != target)
13524 emit_move_insn (target, tmp);
13525 }
13526 else if (target != new_target)
13527 emit_move_insn (target, new_target);
13528 return true;
13529
13530 case E_V8HImode:
13531 case E_V16QImode:
13532 vsimode = V4SImode;
13533 goto widen;
13534 case E_V4HImode:
13535 case E_V8QImode:
13536 if (!mmx_ok)
13537 return false;
13538 vsimode = V2SImode;
13539 goto widen;
13540 widen:
13541 if (one_var != 0)
13542 return false;
13543
13544 /* Zero extend the variable element to SImode and recurse. */
13545 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13546
13547 x = gen_reg_rtx (vsimode);
13548 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13549 var, one_var))
13550 gcc_unreachable ();
13551
13552 emit_move_insn (target, gen_lowpart (mode, x));
13553 return true;
13554
13555 default:
13556 return false;
13557 }
13558 }
13559
13560 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13561 consisting of the values in VALS. It is known that all elements
13562 except ONE_VAR are constants. Return true if successful. */
13563
13564 static bool
13565 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13566 rtx target, rtx vals, int one_var)
13567 {
13568 rtx var = XVECEXP (vals, 0, one_var);
13569 machine_mode wmode;
13570 rtx const_vec, x;
13571
13572 const_vec = copy_rtx (vals);
13573 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13574 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13575
13576 switch (mode)
13577 {
13578 case E_V2DFmode:
13579 case E_V2DImode:
13580 case E_V2SFmode:
13581 case E_V2SImode:
13582 /* For the two element vectors, it's just as easy to use
13583 the general case. */
13584 return false;
13585
13586 case E_V4DImode:
13587 /* Use ix86_expand_vector_set in 64bit mode only. */
13588 if (!TARGET_64BIT)
13589 return false;
13590 /* FALLTHRU */
13591 case E_V4DFmode:
13592 case E_V8SFmode:
13593 case E_V8SImode:
13594 case E_V16HImode:
13595 case E_V32QImode:
13596 case E_V4SFmode:
13597 case E_V4SImode:
13598 case E_V8HImode:
13599 case E_V4HImode:
13600 break;
13601
13602 case E_V16QImode:
13603 if (TARGET_SSE4_1)
13604 break;
13605 wmode = V8HImode;
13606 goto widen;
13607 case E_V8QImode:
13608 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13609 break;
13610 wmode = V4HImode;
13611 goto widen;
13612 widen:
13613 /* There's no way to set one QImode entry easily. Combine
13614 the variable value with its adjacent constant value, and
13615 promote to an HImode set. */
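/* E.g. for V16QImode with ONE_VAR == 5, elements 4 and 5 are
combined into (var << 8) | elt4 and stored as element 2 of a
V8HImode copy of the constant vector.  */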
13616 x = XVECEXP (vals, 0, one_var ^ 1);
13617 if (one_var & 1)
13618 {
13619 var = convert_modes (HImode, QImode, var, true);
13620 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13621 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13622 x = GEN_INT (INTVAL (x) & 0xff);
13623 }
13624 else
13625 {
13626 var = convert_modes (HImode, QImode, var, true);
13627 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13628 }
13629 if (x != const0_rtx)
13630 var = expand_simple_binop (HImode, IOR, var, x, var,
13631 1, OPTAB_LIB_WIDEN);
13632
13633 x = gen_reg_rtx (wmode);
13634 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13635 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13636
13637 emit_move_insn (target, gen_lowpart (mode, x));
13638 return true;
13639
13640 default:
13641 return false;
13642 }
13643
13644 emit_move_insn (target, const_vec);
13645 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13646 return true;
13647 }
13648
13649 /* A subroutine of ix86_expand_vector_init_general. Use vector
13650 concatenate to handle the most general case: all values variable,
13651 and none identical. */
13652
13653 static void
13654 ix86_expand_vector_init_concat (machine_mode mode,
13655 rtx target, rtx *ops, int n)
13656 {
13657 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
13658 rtx first[16], second[8], third[4];
13659 rtvec v;
13660 int i, j;
13661
13662 switch (n)
13663 {
13664 case 2:
13665 switch (mode)
13666 {
13667 case E_V16SImode:
13668 cmode = V8SImode;
13669 break;
13670 case E_V16SFmode:
13671 cmode = V8SFmode;
13672 break;
13673 case E_V8DImode:
13674 cmode = V4DImode;
13675 break;
13676 case E_V8DFmode:
13677 cmode = V4DFmode;
13678 break;
13679 case E_V8SImode:
13680 cmode = V4SImode;
13681 break;
13682 case E_V8SFmode:
13683 cmode = V4SFmode;
13684 break;
13685 case E_V4DImode:
13686 cmode = V2DImode;
13687 break;
13688 case E_V4DFmode:
13689 cmode = V2DFmode;
13690 break;
13691 case E_V4SImode:
13692 cmode = V2SImode;
13693 break;
13694 case E_V4SFmode:
13695 cmode = V2SFmode;
13696 break;
13697 case E_V2DImode:
13698 cmode = DImode;
13699 break;
13700 case E_V2SImode:
13701 cmode = SImode;
13702 break;
13703 case E_V2DFmode:
13704 cmode = DFmode;
13705 break;
13706 case E_V2SFmode:
13707 cmode = SFmode;
13708 break;
13709 default:
13710 gcc_unreachable ();
13711 }
13712
13713 if (!register_operand (ops[1], cmode))
13714 ops[1] = force_reg (cmode, ops[1]);
13715 if (!register_operand (ops[0], cmode))
13716 ops[0] = force_reg (cmode, ops[0]);
13717 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13718 ops[1])));
13719 break;
13720
13721 case 4:
13722 switch (mode)
13723 {
13724 case E_V4DImode:
13725 cmode = V2DImode;
13726 break;
13727 case E_V4DFmode:
13728 cmode = V2DFmode;
13729 break;
13730 case E_V4SImode:
13731 cmode = V2SImode;
13732 break;
13733 case E_V4SFmode:
13734 cmode = V2SFmode;
13735 break;
13736 default:
13737 gcc_unreachable ();
13738 }
13739 goto half;
13740
13741 case 8:
13742 switch (mode)
13743 {
13744 case E_V8DImode:
13745 cmode = V2DImode;
13746 hmode = V4DImode;
13747 break;
13748 case E_V8DFmode:
13749 cmode = V2DFmode;
13750 hmode = V4DFmode;
13751 break;
13752 case E_V8SImode:
13753 cmode = V2SImode;
13754 hmode = V4SImode;
13755 break;
13756 case E_V8SFmode:
13757 cmode = V2SFmode;
13758 hmode = V4SFmode;
13759 break;
13760 default:
13761 gcc_unreachable ();
13762 }
13763 goto half;
13764
13765 case 16:
13766 switch (mode)
13767 {
13768 case E_V16SImode:
13769 cmode = V2SImode;
13770 hmode = V4SImode;
13771 gmode = V8SImode;
13772 break;
13773 case E_V16SFmode:
13774 cmode = V2SFmode;
13775 hmode = V4SFmode;
13776 gmode = V8SFmode;
13777 break;
13778 default:
13779 gcc_unreachable ();
13780 }
13781 goto half;
13782
13783 half:
13784 /* FIXME: We process inputs backward to help RA. PR 36222. */
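/* Pairwise-concatenate the N inputs: first into N/2 CMODE vectors,
then (for larger N) into HMODE and GMODE vectors, and finally into
TARGET.  */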
13785 i = n - 1;
13786 j = (n >> 1) - 1;
13787 for (; i > 0; i -= 2, j--)
13788 {
13789 first[j] = gen_reg_rtx (cmode);
13790 v = gen_rtvec (2, ops[i - 1], ops[i]);
13791 ix86_expand_vector_init (false, first[j],
13792 gen_rtx_PARALLEL (cmode, v));
13793 }
13794
13795 n >>= 1;
13796 if (n > 4)
13797 {
13798 gcc_assert (hmode != VOIDmode);
13799 gcc_assert (gmode != VOIDmode);
13800 for (i = j = 0; i < n; i += 2, j++)
13801 {
13802 second[j] = gen_reg_rtx (hmode);
13803 ix86_expand_vector_init_concat (hmode, second [j],
13804 &first [i], 2);
13805 }
13806 n >>= 1;
13807 for (i = j = 0; i < n; i += 2, j++)
13808 {
13809 third[j] = gen_reg_rtx (gmode);
13810 ix86_expand_vector_init_concat (gmode, third[j],
13811 &second[i], 2);
13812 }
13813 n >>= 1;
13814 ix86_expand_vector_init_concat (mode, target, third, n);
13815 }
13816 else if (n > 2)
13817 {
13818 gcc_assert (hmode != VOIDmode);
13819 for (i = j = 0; i < n; i += 2, j++)
13820 {
13821 second[j] = gen_reg_rtx (hmode);
13822 ix86_expand_vector_init_concat (hmode, second [j],
13823 &first [i], 2);
13824 }
13825 n >>= 1;
13826 ix86_expand_vector_init_concat (mode, target, second, n);
13827 }
13828 else
13829 ix86_expand_vector_init_concat (mode, target, first, n);
13830 break;
13831
13832 default:
13833 gcc_unreachable ();
13834 }
13835 }
13836
13837 /* A subroutine of ix86_expand_vector_init_general. Use vector
13838 interleave to handle the most general case: all values variable,
13839 and none identical. */
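/* Roughly, for V8HImode: pairs of HImode inputs are loaded into
V8HImode registers (low word via a scalar-to-vector move, second
word via vec_set), which are then interleaved as V4SImode and
finally V2DImode vectors.  */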
13840
13841 static void
13842 ix86_expand_vector_init_interleave (machine_mode mode,
13843 rtx target, rtx *ops, int n)
13844 {
13845 machine_mode first_imode, second_imode, third_imode, inner_mode;
13846 int i, j;
13847 rtx op0, op1;
13848 rtx (*gen_load_even) (rtx, rtx, rtx);
13849 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13850 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13851
13852 switch (mode)
13853 {
13854 case E_V8HImode:
13855 gen_load_even = gen_vec_setv8hi;
13856 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13857 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13858 inner_mode = HImode;
13859 first_imode = V4SImode;
13860 second_imode = V2DImode;
13861 third_imode = VOIDmode;
13862 break;
13863 case E_V16QImode:
13864 gen_load_even = gen_vec_setv16qi;
13865 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13866 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13867 inner_mode = QImode;
13868 first_imode = V8HImode;
13869 second_imode = V4SImode;
13870 third_imode = V2DImode;
13871 break;
13872 default:
13873 gcc_unreachable ();
13874 }
13875
13876 for (i = 0; i < n; i++)
13877 {
13878 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13879 op0 = gen_reg_rtx (SImode);
13880 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13881
13882 /* Insert the SImode value as low element of V4SImode vector. */
13883 op1 = gen_reg_rtx (V4SImode);
13884 op0 = gen_rtx_VEC_MERGE (V4SImode,
13885 gen_rtx_VEC_DUPLICATE (V4SImode,
13886 op0),
13887 CONST0_RTX (V4SImode),
13888 const1_rtx);
13889 emit_insn (gen_rtx_SET (op1, op0));
13890
13891 /* Cast the V4SImode vector back to a vector in the original mode. */
13892 op0 = gen_reg_rtx (mode);
13893 emit_move_insn (op0, gen_lowpart (mode, op1));
13894
13895 /* Load even elements into the second position. */
13896 emit_insn (gen_load_even (op0,
13897 force_reg (inner_mode,
13898 ops [i + i + 1]),
13899 const1_rtx));
13900
13901 /* Cast vector to FIRST_IMODE vector. */
13902 ops[i] = gen_reg_rtx (first_imode);
13903 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13904 }
13905
13906 /* Interleave low FIRST_IMODE vectors. */
13907 for (i = j = 0; i < n; i += 2, j++)
13908 {
13909 op0 = gen_reg_rtx (first_imode);
13910 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13911
13912 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13913 ops[j] = gen_reg_rtx (second_imode);
13914 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13915 }
13916
13917 /* Interleave low SECOND_IMODE vectors. */
13918 switch (second_imode)
13919 {
13920 case E_V4SImode:
13921 for (i = j = 0; i < n / 2; i += 2, j++)
13922 {
13923 op0 = gen_reg_rtx (second_imode);
13924 emit_insn (gen_interleave_second_low (op0, ops[i],
13925 ops[i + 1]));
13926
13927 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13928 vector. */
13929 ops[j] = gen_reg_rtx (third_imode);
13930 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13931 }
13932 second_imode = V2DImode;
13933 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13934 /* FALLTHRU */
13935
13936 case E_V2DImode:
13937 op0 = gen_reg_rtx (second_imode);
13938 emit_insn (gen_interleave_second_low (op0, ops[0],
13939 ops[1]));
13940
13941 /* Cast the SECOND_IMODE vector back to a vector in the
13942 original mode. */
13943 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13944 break;
13945
13946 default:
13947 gcc_unreachable ();
13948 }
13949 }
13950
13951 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13952 all values variable, and none identical. */
13953
13954 static void
13955 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13956 rtx target, rtx vals)
13957 {
13958 rtx ops[64], op0, op1, op2, op3, op4, op5;
13959 machine_mode half_mode = VOIDmode;
13960 machine_mode quarter_mode = VOIDmode;
13961 int n, i;
13962
13963 switch (mode)
13964 {
13965 case E_V2SFmode:
13966 case E_V2SImode:
13967 if (!mmx_ok && !TARGET_SSE)
13968 break;
13969 /* FALLTHRU */
13970
13971 case E_V16SImode:
13972 case E_V16SFmode:
13973 case E_V8DFmode:
13974 case E_V8DImode:
13975 case E_V8SFmode:
13976 case E_V8SImode:
13977 case E_V4DFmode:
13978 case E_V4DImode:
13979 case E_V4SFmode:
13980 case E_V4SImode:
13981 case E_V2DFmode:
13982 case E_V2DImode:
13983 n = GET_MODE_NUNITS (mode);
13984 for (i = 0; i < n; i++)
13985 ops[i] = XVECEXP (vals, 0, i);
13986 ix86_expand_vector_init_concat (mode, target, ops, n);
13987 return;
13988
13989 case E_V2TImode:
13990 for (i = 0; i < 2; i++)
13991 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13992 op0 = gen_reg_rtx (V4DImode);
13993 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13994 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13995 return;
13996
13997 case E_V4TImode:
13998 for (i = 0; i < 4; i++)
13999 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14000 ops[4] = gen_reg_rtx (V4DImode);
14001 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
14002 ops[5] = gen_reg_rtx (V4DImode);
14003 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14004 op0 = gen_reg_rtx (V8DImode);
14005 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14006 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14007 return;
14008
14009 case E_V32QImode:
14010 half_mode = V16QImode;
14011 goto half;
14012
14013 case E_V16HImode:
14014 half_mode = V8HImode;
14015 goto half;
14016
14017 half:
14018 n = GET_MODE_NUNITS (mode);
14019 for (i = 0; i < n; i++)
14020 ops[i] = XVECEXP (vals, 0, i);
14021 op0 = gen_reg_rtx (half_mode);
14022 op1 = gen_reg_rtx (half_mode);
14023 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14024 n >> 2);
14025 ix86_expand_vector_init_interleave (half_mode, op1,
14026 &ops [n >> 1], n >> 2);
14027 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14028 return;
14029
14030 case E_V64QImode:
14031 quarter_mode = V16QImode;
14032 half_mode = V32QImode;
14033 goto quarter;
14034
14035 case E_V32HImode:
14036 quarter_mode = V8HImode;
14037 half_mode = V16HImode;
14038 goto quarter;
14039
14040 quarter:
14041 n = GET_MODE_NUNITS (mode);
14042 for (i = 0; i < n; i++)
14043 ops[i] = XVECEXP (vals, 0, i);
14044 op0 = gen_reg_rtx (quarter_mode);
14045 op1 = gen_reg_rtx (quarter_mode);
14046 op2 = gen_reg_rtx (quarter_mode);
14047 op3 = gen_reg_rtx (quarter_mode);
14048 op4 = gen_reg_rtx (half_mode);
14049 op5 = gen_reg_rtx (half_mode);
14050 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14051 n >> 3);
14052 ix86_expand_vector_init_interleave (quarter_mode, op1,
14053 &ops [n >> 2], n >> 3);
14054 ix86_expand_vector_init_interleave (quarter_mode, op2,
14055 &ops [n >> 1], n >> 3);
14056 ix86_expand_vector_init_interleave (quarter_mode, op3,
14057 &ops [(n >> 1) | (n >> 2)], n >> 3);
14058 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14059 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14060 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14061 return;
14062
14063 case E_V16QImode:
14064 if (!TARGET_SSE4_1)
14065 break;
14066 /* FALLTHRU */
14067
14068 case E_V8HImode:
14069 if (!TARGET_SSE2)
14070 break;
14071
14072 /* Don't use ix86_expand_vector_init_interleave if we can't
14073 move from GPR to SSE register directly. */
14074 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14075 break;
14076
14077 n = GET_MODE_NUNITS (mode);
14078 for (i = 0; i < n; i++)
14079 ops[i] = XVECEXP (vals, 0, i);
14080 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14081 return;
14082
14083 case E_V4HImode:
14084 case E_V8QImode:
14085 break;
14086
14087 default:
14088 gcc_unreachable ();
14089 }
14090
14091 {
14092 int i, j, n_elts, n_words, n_elt_per_word;
14093 machine_mode inner_mode;
14094 rtx words[4], shift;
14095
14096 inner_mode = GET_MODE_INNER (mode);
14097 n_elts = GET_MODE_NUNITS (mode);
14098 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14099 n_elt_per_word = n_elts / n_words;
14100 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14101
14102 for (i = 0; i < n_words; ++i)
14103 {
14104 rtx word = NULL_RTX;
14105
14106 for (j = 0; j < n_elt_per_word; ++j)
14107 {
14108 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14109 elt = convert_modes (word_mode, inner_mode, elt, true);
14110
14111 if (j == 0)
14112 word = elt;
14113 else
14114 {
14115 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14116 word, 1, OPTAB_LIB_WIDEN);
14117 word = expand_simple_binop (word_mode, IOR, word, elt,
14118 word, 1, OPTAB_LIB_WIDEN);
14119 }
14120 }
14121
14122 words[i] = word;
14123 }
14124
14125 if (n_words == 1)
14126 emit_move_insn (target, gen_lowpart (mode, words[0]));
14127 else if (n_words == 2)
14128 {
14129 rtx tmp = gen_reg_rtx (mode);
14130 emit_clobber (tmp);
14131 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14132 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14133 emit_move_insn (target, tmp);
14134 }
14135 else if (n_words == 4)
14136 {
14137 rtx tmp = gen_reg_rtx (V4SImode);
14138 gcc_assert (word_mode == SImode);
14139 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14140 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14141 emit_move_insn (target, gen_lowpart (mode, tmp));
14142 }
14143 else
14144 gcc_unreachable ();
14145 }
14146 }
14147
14148 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14149 instructions unless MMX_OK is true. */
14150
14151 void
14152 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14153 {
14154 machine_mode mode = GET_MODE (target);
14155 machine_mode inner_mode = GET_MODE_INNER (mode);
14156 int n_elts = GET_MODE_NUNITS (mode);
14157 int n_var = 0, one_var = -1;
14158 bool all_same = true, all_const_zero = true;
14159 int i;
14160 rtx x;
14161
14162 /* First handle initialization where the elements of VALS are themselves vectors. */
14163 if (n_elts != XVECLEN (vals, 0))
14164 {
14165 rtx subtarget = target;
14166 x = XVECEXP (vals, 0, 0);
14167 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14168 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14169 {
14170 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14171 if (inner_mode == QImode || inner_mode == HImode)
14172 {
14173 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14174 mode = mode_for_vector (SImode, n_bits / 4).require ();
14175 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14176 ops[0] = gen_lowpart (inner_mode, ops[0]);
14177 ops[1] = gen_lowpart (inner_mode, ops[1]);
14178 subtarget = gen_reg_rtx (mode);
14179 }
14180 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14181 if (subtarget != target)
14182 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14183 return;
14184 }
14185 gcc_unreachable ();
14186 }
14187
14188 for (i = 0; i < n_elts; ++i)
14189 {
14190 x = XVECEXP (vals, 0, i);
14191 if (!(CONST_SCALAR_INT_P (x)
14192 || CONST_DOUBLE_P (x)
14193 || CONST_FIXED_P (x)))
14194 n_var++, one_var = i;
14195 else if (x != CONST0_RTX (inner_mode))
14196 all_const_zero = false;
14197 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14198 all_same = false;
14199 }
14200
14201 /* Constants are best loaded from the constant pool. */
14202 if (n_var == 0)
14203 {
14204 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14205 return;
14206 }
14207
14208 /* If all values are identical, broadcast the value. */
14209 if (all_same
14210 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14211 XVECEXP (vals, 0, 0)))
14212 return;
14213
14214 /* Values where only one field is non-constant are best loaded from
14215 the pool and overwritten via move later. */
14216 if (n_var == 1)
14217 {
14218 if (all_const_zero
14219 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14220 XVECEXP (vals, 0, one_var),
14221 one_var))
14222 return;
14223
14224 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14225 return;
14226 }
14227
14228 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14229 }
14230
14231 void
14232 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14233 {
14234 machine_mode mode = GET_MODE (target);
14235 machine_mode inner_mode = GET_MODE_INNER (mode);
14236 machine_mode half_mode;
14237 bool use_vec_merge = false;
14238 rtx tmp;
14239 static rtx (*gen_extract[6][2]) (rtx, rtx)
14240 = {
14241 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14242 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14243 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14244 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14245 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14246 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14247 };
14248 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14249 = {
14250 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14251 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14252 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14253 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14254 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14255 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14256 };
14257 int i, j, n;
14258 machine_mode mmode = VOIDmode;
14259 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14260
14261 switch (mode)
14262 {
14263 case E_V2SImode:
14264 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14265 if (use_vec_merge)
14266 break;
14267 /* FALLTHRU */
14268
14269 case E_V2SFmode:
14270 if (mmx_ok)
14271 {
14272 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14273 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14274 if (elt == 0)
14275 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14276 else
14277 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14278 emit_insn (gen_rtx_SET (target, tmp));
14279 return;
14280 }
14281 break;
14282
14283 case E_V2DImode:
14284 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14285 if (use_vec_merge)
14286 break;
14287
14288 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14289 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14290 if (elt == 0)
14291 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14292 else
14293 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14294 emit_insn (gen_rtx_SET (target, tmp));
14295 return;
14296
14297 case E_V2DFmode:
14298 /* NB: For ELT == 0, use standard scalar operation patterns which
14299 preserve the rest of the vector for combiner:
14300
14301 (vec_merge:V2DF
14302 (vec_duplicate:V2DF (reg:DF))
14303 (reg:V2DF)
14304 (const_int 1))
14305 */
14306 if (elt == 0)
14307 goto do_vec_merge;
14308
14309 {
14310 rtx op0, op1;
14311
14312 /* For the two element vectors, we implement a VEC_CONCAT with
14313 the extraction of the other element. */
14314
14315 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14316 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14317
14318 if (elt == 0)
14319 op0 = val, op1 = tmp;
14320 else
14321 op0 = tmp, op1 = val;
14322
14323 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14324 emit_insn (gen_rtx_SET (target, tmp));
14325 }
14326 return;
14327
14328 case E_V4SFmode:
14329 use_vec_merge = TARGET_SSE4_1;
14330 if (use_vec_merge)
14331 break;
14332
14333 switch (elt)
14334 {
14335 case 0:
14336 use_vec_merge = true;
14337 break;
14338
14339 case 1:
14340 /* tmp = target = A B C D */
14341 tmp = copy_to_reg (target);
14342 /* target = A A B B */
14343 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14344 /* target = X A B B */
14345 ix86_expand_vector_set (false, target, val, 0);
14346 /* target = A X C D */
14347 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14348 const1_rtx, const0_rtx,
14349 GEN_INT (2+4), GEN_INT (3+4)));
14350 return;
14351
14352 case 2:
14353 /* tmp = target = A B C D */
14354 tmp = copy_to_reg (target);
14355 /* tmp = X B C D */
14356 ix86_expand_vector_set (false, tmp, val, 0);
14357 /* target = A B X D */
14358 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14359 const0_rtx, const1_rtx,
14360 GEN_INT (0+4), GEN_INT (3+4)));
14361 return;
14362
14363 case 3:
14364 /* tmp = target = A B C D */
14365 tmp = copy_to_reg (target);
14366 /* tmp = X B C D */
14367 ix86_expand_vector_set (false, tmp, val, 0);
14368 /* target = A B C X */
14369 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14370 const0_rtx, const1_rtx,
14371 GEN_INT (2+4), GEN_INT (0+4)));
14372 return;
14373
14374 default:
14375 gcc_unreachable ();
14376 }
14377 break;
14378
14379 case E_V4SImode:
14380 use_vec_merge = TARGET_SSE4_1;
14381 if (use_vec_merge)
14382 break;
14383
14384 /* Element 0 handled by vec_merge below. */
14385 if (elt == 0)
14386 {
14387 use_vec_merge = true;
14388 break;
14389 }
14390
14391 if (TARGET_SSE2)
14392 {
14393 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14394 store into element 0, then shuffle them back. */
14395
14396 rtx order[4];
14397
14398 order[0] = GEN_INT (elt);
14399 order[1] = const1_rtx;
14400 order[2] = const2_rtx;
14401 order[3] = GEN_INT (3);
14402 order[elt] = const0_rtx;
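/* E.g. for ELT == 2 the order is { 2, 1, 0, 3 }: the first pshufd
swaps elements 0 and 2, and the second one swaps them back after
the store into element 0.  */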
14403
14404 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14405 order[1], order[2], order[3]));
14406
14407 ix86_expand_vector_set (false, target, val, 0);
14408
14409 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14410 order[1], order[2], order[3]));
14411 }
14412 else
14413 {
14414 /* For SSE1, we have to reuse the V4SF code. */
14415 rtx t = gen_reg_rtx (V4SFmode);
14416 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14417 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14418 emit_move_insn (target, gen_lowpart (mode, t));
14419 }
14420 return;
14421
14422 case E_V8HImode:
14423 use_vec_merge = TARGET_SSE2;
14424 break;
14425 case E_V4HImode:
14426 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14427 break;
14428
14429 case E_V16QImode:
14430 use_vec_merge = TARGET_SSE4_1;
14431 break;
14432
14433 case E_V8QImode:
14434 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14435 break;
14436
14437 case E_V32QImode:
14438 half_mode = V16QImode;
14439 j = 0;
14440 n = 16;
14441 goto half;
14442
14443 case E_V16HImode:
14444 half_mode = V8HImode;
14445 j = 1;
14446 n = 8;
14447 goto half;
14448
14449 case E_V8SImode:
14450 half_mode = V4SImode;
14451 j = 2;
14452 n = 4;
14453 goto half;
14454
14455 case E_V4DImode:
14456 half_mode = V2DImode;
14457 j = 3;
14458 n = 2;
14459 goto half;
14460
14461 case E_V8SFmode:
14462 half_mode = V4SFmode;
14463 j = 4;
14464 n = 4;
14465 goto half;
14466
14467 case E_V4DFmode:
14468 half_mode = V2DFmode;
14469 j = 5;
14470 n = 2;
14471 goto half;
14472
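/* For the 256-bit modes, operate on the 128-bit half that contains
ELT: extract it, set the element there, and insert it back.
E.g. a V8SImode set at ELT == 5 becomes a V4SImode set at element
1 of the high half.  */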
14473 half:
14474 /* Compute offset. */
14475 i = elt / n;
14476 elt %= n;
14477
14478 gcc_assert (i <= 1);
14479
14480 /* Extract the half. */
14481 tmp = gen_reg_rtx (half_mode);
14482 emit_insn (gen_extract[j][i] (tmp, target));
14483
14484 /* Put val in tmp at elt. */
14485 ix86_expand_vector_set (false, tmp, val, elt);
14486
14487 /* Put it back. */
14488 emit_insn (gen_insert[j][i] (target, target, tmp));
14489 return;
14490
14491 case E_V8DFmode:
14492 if (TARGET_AVX512F)
14493 {
14494 mmode = QImode;
14495 gen_blendm = gen_avx512f_blendmv8df;
14496 }
14497 break;
14498
14499 case E_V8DImode:
14500 if (TARGET_AVX512F)
14501 {
14502 mmode = QImode;
14503 gen_blendm = gen_avx512f_blendmv8di;
14504 }
14505 break;
14506
14507 case E_V16SFmode:
14508 if (TARGET_AVX512F)
14509 {
14510 mmode = HImode;
14511 gen_blendm = gen_avx512f_blendmv16sf;
14512 }
14513 break;
14514
14515 case E_V16SImode:
14516 if (TARGET_AVX512F)
14517 {
14518 mmode = HImode;
14519 gen_blendm = gen_avx512f_blendmv16si;
14520 }
14521 break;
14522
14523 case E_V32HImode:
14524 if (TARGET_AVX512BW)
14525 {
14526 mmode = SImode;
14527 gen_blendm = gen_avx512bw_blendmv32hi;
14528 }
14529 else if (TARGET_AVX512F)
14530 {
14531 half_mode = E_V8HImode;
14532 n = 8;
14533 goto quarter;
14534 }
14535 break;
14536
14537 case E_V64QImode:
14538 if (TARGET_AVX512BW)
14539 {
14540 mmode = DImode;
14541 gen_blendm = gen_avx512bw_blendmv64qi;
14542 }
14543 else if (TARGET_AVX512F)
14544 {
14545 half_mode = E_V16QImode;
14546 n = 16;
14547 goto quarter;
14548 }
14549 break;
14550
14551 quarter:
14552 /* Compute offset. */
14553 i = elt / n;
14554 elt %= n;
14555
14556 gcc_assert (i <= 3);
14557
14558 {
14559 /* Extract the quarter. */
14560 tmp = gen_reg_rtx (V4SImode);
14561 rtx tmp2 = gen_lowpart (V16SImode, target);
14562 rtx mask = gen_reg_rtx (QImode);
14563
14564 emit_move_insn (mask, constm1_rtx);
14565 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14566 tmp, mask));
14567
14568 tmp2 = gen_reg_rtx (half_mode);
14569 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14570 tmp = tmp2;
14571
14572 /* Put val in tmp at elt. */
14573 ix86_expand_vector_set (false, tmp, val, elt);
14574
14575 /* Put it back. */
14576 tmp2 = gen_reg_rtx (V16SImode);
14577 rtx tmp3 = gen_lowpart (V16SImode, target);
14578 mask = gen_reg_rtx (HImode);
14579 emit_move_insn (mask, constm1_rtx);
14580 tmp = gen_lowpart (V4SImode, tmp);
14581 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14582 tmp3, mask));
14583 emit_move_insn (target, gen_lowpart (mode, tmp2));
14584 }
14585 return;
14586
14587 default:
14588 break;
14589 }
14590
14591 if (mmode != VOIDmode)
14592 {
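/* Broadcast VAL and blend it into TARGET under a single-bit mask
selecting element ELT.  */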
14593 tmp = gen_reg_rtx (mode);
14594 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14595 /* The avx512*_blendm<mode> expanders have different operand order
14596 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14597 elements where the mask is set and second input operand otherwise,
14598 in {sse,avx}*_*blend* the first input operand is used for elements
14599 where the mask is clear and second input operand otherwise. */
14600 emit_insn (gen_blendm (target, target, tmp,
14601 force_reg (mmode,
14602 gen_int_mode (HOST_WIDE_INT_1U << elt,
14603 mmode))));
14604 }
14605 else if (use_vec_merge)
14606 {
14607 do_vec_merge:
14608 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14609 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14610 GEN_INT (HOST_WIDE_INT_1U << elt));
14611 emit_insn (gen_rtx_SET (target, tmp));
14612 }
14613 else
14614 {
14615 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14616
14617 emit_move_insn (mem, target);
14618
14619 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14620 emit_move_insn (tmp, val);
14621
14622 emit_move_insn (target, mem);
14623 }
14624 }
14625
14626 void
14627 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14628 {
14629 machine_mode mode = GET_MODE (vec);
14630 machine_mode inner_mode = GET_MODE_INNER (mode);
14631 bool use_vec_extr = false;
14632 rtx tmp;
14633
14634 switch (mode)
14635 {
14636 case E_V2SImode:
14637 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14638 if (use_vec_extr)
14639 break;
14640 /* FALLTHRU */
14641
14642 case E_V2SFmode:
14643 if (!mmx_ok)
14644 break;
14645 /* FALLTHRU */
14646
14647 case E_V2DFmode:
14648 case E_V2DImode:
14649 case E_V2TImode:
14650 case E_V4TImode:
14651 use_vec_extr = true;
14652 break;
14653
14654 case E_V4SFmode:
14655 use_vec_extr = TARGET_SSE4_1;
14656 if (use_vec_extr)
14657 break;
14658
14659 switch (elt)
14660 {
14661 case 0:
14662 tmp = vec;
14663 break;
14664
14665 case 1:
14666 case 3:
14667 tmp = gen_reg_rtx (mode);
14668 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14669 GEN_INT (elt), GEN_INT (elt),
14670 GEN_INT (elt+4), GEN_INT (elt+4)));
14671 break;
14672
14673 case 2:
14674 tmp = gen_reg_rtx (mode);
14675 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14676 break;
14677
14678 default:
14679 gcc_unreachable ();
14680 }
14681 vec = tmp;
14682 use_vec_extr = true;
14683 elt = 0;
14684 break;
14685
14686 case E_V4SImode:
14687 use_vec_extr = TARGET_SSE4_1;
14688 if (use_vec_extr)
14689 break;
14690
14691 if (TARGET_SSE2)
14692 {
14693 switch (elt)
14694 {
14695 case 0:
14696 tmp = vec;
14697 break;
14698
14699 case 1:
14700 case 3:
14701 tmp = gen_reg_rtx (mode);
14702 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14703 GEN_INT (elt), GEN_INT (elt),
14704 GEN_INT (elt), GEN_INT (elt)));
14705 break;
14706
14707 case 2:
14708 tmp = gen_reg_rtx (mode);
14709 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14710 break;
14711
14712 default:
14713 gcc_unreachable ();
14714 }
14715 vec = tmp;
14716 use_vec_extr = true;
14717 elt = 0;
14718 }
14719 else
14720 {
14721 /* For SSE1, we have to reuse the V4SF code. */
14722 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14723 gen_lowpart (V4SFmode, vec), elt);
14724 return;
14725 }
14726 break;
14727
14728 case E_V8HImode:
14729 use_vec_extr = TARGET_SSE2;
14730 break;
14731 case E_V4HImode:
14732 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14733 break;
14734
14735 case E_V16QImode:
14736 use_vec_extr = TARGET_SSE4_1;
14737 if (!use_vec_extr
14738 && TARGET_SSE2
14739 && elt == 0
14740 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14741 {
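/* Element 0 can be extracted with a plain scalar move: take the
low SImode element and use its low byte.  */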
14742 tmp = gen_reg_rtx (SImode);
14743 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14744 0);
14745 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14746 return;
14747 }
14748 break;
14749
14750 case E_V8SFmode:
14751 if (TARGET_AVX)
14752 {
14753 tmp = gen_reg_rtx (V4SFmode);
14754 if (elt < 4)
14755 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14756 else
14757 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14758 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14759 return;
14760 }
14761 break;
14762
14763 case E_V4DFmode:
14764 if (TARGET_AVX)
14765 {
14766 tmp = gen_reg_rtx (V2DFmode);
14767 if (elt < 2)
14768 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14769 else
14770 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14771 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14772 return;
14773 }
14774 break;
14775
14776 case E_V32QImode:
14777 if (TARGET_AVX)
14778 {
14779 tmp = gen_reg_rtx (V16QImode);
14780 if (elt < 16)
14781 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14782 else
14783 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14784 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14785 return;
14786 }
14787 break;
14788
14789 case E_V16HImode:
14790 if (TARGET_AVX)
14791 {
14792 tmp = gen_reg_rtx (V8HImode);
14793 if (elt < 8)
14794 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14795 else
14796 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14797 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14798 return;
14799 }
14800 break;
14801
14802 case E_V8SImode:
14803 if (TARGET_AVX)
14804 {
14805 tmp = gen_reg_rtx (V4SImode);
14806 if (elt < 4)
14807 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14808 else
14809 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14810 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14811 return;
14812 }
14813 break;
14814
14815 case E_V4DImode:
14816 if (TARGET_AVX)
14817 {
14818 tmp = gen_reg_rtx (V2DImode);
14819 if (elt < 2)
14820 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14821 else
14822 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14823 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14824 return;
14825 }
14826 break;
14827
14828 case E_V32HImode:
14829 if (TARGET_AVX512BW)
14830 {
14831 tmp = gen_reg_rtx (V16HImode);
14832 if (elt < 16)
14833 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14834 else
14835 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14836 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14837 return;
14838 }
14839 break;
14840
14841 case E_V64QImode:
14842 if (TARGET_AVX512BW)
14843 {
14844 tmp = gen_reg_rtx (V32QImode);
14845 if (elt < 32)
14846 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14847 else
14848 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14849 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14850 return;
14851 }
14852 break;
14853
14854 case E_V16SFmode:
14855 tmp = gen_reg_rtx (V8SFmode);
14856 if (elt < 8)
14857 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14858 else
14859 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14860 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14861 return;
14862
14863 case E_V8DFmode:
14864 tmp = gen_reg_rtx (V4DFmode);
14865 if (elt < 4)
14866 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14867 else
14868 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14869 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14870 return;
14871
14872 case E_V16SImode:
14873 tmp = gen_reg_rtx (V8SImode);
14874 if (elt < 8)
14875 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14876 else
14877 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14878 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14879 return;
14880
14881 case E_V8DImode:
14882 tmp = gen_reg_rtx (V4DImode);
14883 if (elt < 4)
14884 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14885 else
14886 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14887 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14888 return;
14889
14890 case E_V8QImode:
14891 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14892 /* ??? Could extract the appropriate HImode element and shift. */
14893 break;
14894
14895 default:
14896 break;
14897 }
14898
14899 if (use_vec_extr)
14900 {
14901 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14902 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14903
14904 /* Let the rtl optimizers know about the zero extension performed. */
14905 if (inner_mode == QImode || inner_mode == HImode)
14906 {
14907 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14908 target = gen_lowpart (SImode, target);
14909 }
14910
14911 emit_insn (gen_rtx_SET (target, tmp));
14912 }
14913 else
14914 {
14915 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14916
14917 emit_move_insn (mem, vec);
14918
14919 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14920 emit_move_insn (target, tmp);
14921 }
14922 }
14923
14924 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14925 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14926 The upper bits of DEST are undefined, though they shouldn't cause
14927 exceptions (some bits from src or all zeros are ok). */
14928
14929 static void
14930 emit_reduc_half (rtx dest, rtx src, int i)
14931 {
14932 rtx tem, d = dest;
14933 switch (GET_MODE (src))
14934 {
14935 case E_V4SFmode:
14936 if (i == 128)
14937 tem = gen_sse_movhlps (dest, src, src);
14938 else
14939 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14940 GEN_INT (1 + 4), GEN_INT (1 + 4));
14941 break;
14942 case E_V2DFmode:
14943 tem = gen_vec_interleave_highv2df (dest, src, src);
14944 break;
14945 case E_V16QImode:
14946 case E_V8HImode:
14947 case E_V4SImode:
14948 case E_V2DImode:
14949 d = gen_reg_rtx (V1TImode);
14950 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14951 GEN_INT (i / 2));
14952 break;
14953 case E_V8SFmode:
14954 if (i == 256)
14955 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14956 else
14957 tem = gen_avx_shufps256 (dest, src, src,
14958 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14959 break;
14960 case E_V4DFmode:
14961 if (i == 256)
14962 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14963 else
14964 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14965 break;
14966 case E_V32QImode:
14967 case E_V16HImode:
14968 case E_V8SImode:
14969 case E_V4DImode:
14970 if (i == 256)
14971 {
14972 if (GET_MODE (dest) != V4DImode)
14973 d = gen_reg_rtx (V4DImode);
14974 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14975 gen_lowpart (V4DImode, src),
14976 const1_rtx);
14977 }
14978 else
14979 {
14980 d = gen_reg_rtx (V2TImode);
14981 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14982 GEN_INT (i / 2));
14983 }
14984 break;
14985 case E_V64QImode:
14986 case E_V32HImode:
14987 case E_V16SImode:
14988 case E_V16SFmode:
14989 case E_V8DImode:
14990 case E_V8DFmode:
14991 if (i > 128)
14992 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14993 gen_lowpart (V16SImode, src),
14994 gen_lowpart (V16SImode, src),
14995 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14996 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14997 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14998 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14999 GEN_INT (0xC), GEN_INT (0xD),
15000 GEN_INT (0xE), GEN_INT (0xF),
15001 GEN_INT (0x10), GEN_INT (0x11),
15002 GEN_INT (0x12), GEN_INT (0x13),
15003 GEN_INT (0x14), GEN_INT (0x15),
15004 GEN_INT (0x16), GEN_INT (0x17));
15005 else
15006 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
15007 gen_lowpart (V16SImode, src),
15008 GEN_INT (i == 128 ? 0x2 : 0x1),
15009 GEN_INT (0x3),
15010 GEN_INT (0x3),
15011 GEN_INT (0x3),
15012 GEN_INT (i == 128 ? 0x6 : 0x5),
15013 GEN_INT (0x7),
15014 GEN_INT (0x7),
15015 GEN_INT (0x7),
15016 GEN_INT (i == 128 ? 0xA : 0x9),
15017 GEN_INT (0xB),
15018 GEN_INT (0xB),
15019 GEN_INT (0xB),
15020 GEN_INT (i == 128 ? 0xE : 0xD),
15021 GEN_INT (0xF),
15022 GEN_INT (0xF),
15023 GEN_INT (0xF));
15024 break;
15025 default:
15026 gcc_unreachable ();
15027 }
15028 emit_insn (tem);
15029 if (d != dest)
15030 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15031 }
15032
15033 /* Expand a vector reduction. FN is the binary pattern to reduce;
15034 DEST is the destination; IN is the input vector. */
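/* E.g. for V4SImode this takes two rounds: combine the two 64-bit
halves, then the two remaining elements; the final value ends up
in element 0 of DEST.  */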
15035
15036 void
15037 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15038 {
15039 rtx half, dst, vec = in;
15040 machine_mode mode = GET_MODE (in);
15041 int i;
15042
15043 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15044 if (TARGET_SSE4_1
15045 && mode == V8HImode
15046 && fn == gen_uminv8hi3)
15047 {
15048 emit_insn (gen_sse4_1_phminposuw (dest, in));
15049 return;
15050 }
15051
15052 for (i = GET_MODE_BITSIZE (mode);
15053 i > GET_MODE_UNIT_BITSIZE (mode);
15054 i >>= 1)
15055 {
15056 half = gen_reg_rtx (mode);
15057 emit_reduc_half (half, vec, i);
15058 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15059 dst = dest;
15060 else
15061 dst = gen_reg_rtx (mode);
15062 emit_insn (fn (dst, half, vec));
15063 vec = dst;
15064 }
15065 }
15066
15067 /* Output code to perform a conditional jump to LABEL, if C2 flag in
15068 FP status register is set. */
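/* Roughly:

fnstsw %ax
sahf ; C2 ends up in PF
jp label

or, when SAHF is not usable,

fnstsw %ax
testb $0x04, %ah ; C2 is bit 2 of the upper status byte
jne label  */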
15069
15070 void
15071 ix86_emit_fp_unordered_jump (rtx label)
15072 {
15073 rtx reg = gen_reg_rtx (HImode);
15074 rtx_insn *insn;
15075 rtx temp;
15076
15077 emit_insn (gen_x86_fnstsw_1 (reg));
15078
15079 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15080 {
15081 emit_insn (gen_x86_sahf_1 (reg));
15082
15083 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15084 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15085 }
15086 else
15087 {
15088 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15089
15090 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15091 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15092 }
15093
15094 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15095 gen_rtx_LABEL_REF (VOIDmode, label),
15096 pc_rtx);
15097 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15098 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15099 JUMP_LABEL (insn) = label;
15100 }
15101
15102 /* Output code to perform a sinh XFmode calculation. */
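/* Based on sinh(x) = 0.5 * copysign (u + u / (u + 1.0), x) with
u = expm1 (|x|), which avoids the cancellation in exp(x) - exp(-x)
for small |x|.  */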
15103
15104 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15105 {
15106 rtx e1 = gen_reg_rtx (XFmode);
15107 rtx e2 = gen_reg_rtx (XFmode);
15108 rtx scratch = gen_reg_rtx (HImode);
15109 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15110 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15111 rtx cst1, tmp;
15112 rtx_code_label *jump_label = gen_label_rtx ();
15113 rtx_insn *insn;
15114
15115 /* scratch = fxam (op1) */
15116 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15117
15118 /* e1 = expm1 (|op1|) */
15119 emit_insn (gen_absxf2 (e2, op1));
15120 emit_insn (gen_expm1xf2 (e1, e2));
15121
15122 /* e2 = e1 / (e1 + 1.0) + e1 */
15123 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15124 emit_insn (gen_addxf3 (e2, e1, cst1));
15125 emit_insn (gen_divxf3 (e2, e1, e2));
15126 emit_insn (gen_addxf3 (e2, e2, e1));
15127
15128 /* flags = signbit (op1) */
15129 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15130
15131 /* if (flags) then e2 = -e2 */
15132 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15133 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15134 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15135 pc_rtx);
15136 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15137 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15138 JUMP_LABEL (insn) = jump_label;
15139
15140 emit_insn (gen_negxf2 (e2, e2));
15141
15142 emit_label (jump_label);
15143 LABEL_NUSES (jump_label) = 1;
15144
15145 /* op0 = 0.5 * e2 */
15146 half = force_reg (XFmode, half);
15147 emit_insn (gen_mulxf3 (op0, e2, half));
15148 }
15149
15150 /* Output code to perform a cosh XFmode calculation. */
15151
15152 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15153 {
15154 rtx e1 = gen_reg_rtx (XFmode);
15155 rtx e2 = gen_reg_rtx (XFmode);
15156 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15157 rtx cst1;
15158
15159 /* e1 = exp (op1) */
15160 emit_insn (gen_expxf2 (e1, op1));
15161
15162 /* e2 = e1 + 1.0 / e1 */
15163 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15164 emit_insn (gen_divxf3 (e2, cst1, e1));
15165 emit_insn (gen_addxf3 (e2, e1, e2));
15166
15167 /* op0 = 0.5 * e2 */
15168 half = force_reg (XFmode, half);
15169 emit_insn (gen_mulxf3 (op0, e2, half));
15170 }
15171
15172 /* Output code to perform a tanh XFmode calculation. */
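/* The expansion below uses the identity
     tanh(x) = -sign(x) * e / (e + 2.0), where e = expm1 (-2 * |x|).  */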
15173
15174 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15175 {
15176 rtx e1 = gen_reg_rtx (XFmode);
15177 rtx e2 = gen_reg_rtx (XFmode);
15178 rtx scratch = gen_reg_rtx (HImode);
15179 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15180 rtx cst2, tmp;
15181 rtx_code_label *jump_label = gen_label_rtx ();
15182 rtx_insn *insn;
15183
15184 /* scratch = fxam (op1) */
15185 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15186
15187 /* e1 = expm1 (-|2 * op1|) */
15188 emit_insn (gen_addxf3 (e2, op1, op1));
15189 emit_insn (gen_absxf2 (e2, e2));
15190 emit_insn (gen_negxf2 (e2, e2));
15191 emit_insn (gen_expm1xf2 (e1, e2));
15192
15193 /* e2 = e1 / (e1 + 2.0) */
15194 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15195 emit_insn (gen_addxf3 (e2, e1, cst2));
15196 emit_insn (gen_divxf3 (e2, e1, e2));
15197
15198 /* flags = signbit (op1) */
15199 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15200
15201 /* if (!flags) then e2 = -e2 */
15202 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15203 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15204 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15205 pc_rtx);
15206 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15207 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15208 JUMP_LABEL (insn) = jump_label;
15209
15210 emit_insn (gen_negxf2 (e2, e2));
15211
15212 emit_label (jump_label);
15213 LABEL_NUSES (jump_label) = 1;
15214
15215 emit_move_insn (op0, e2);
15216 }
15217
15218 /* Output code to perform an asinh XFmode calculation. */
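/* The expansion below uses the identity
     asinh(x) = sign(x) * log1p (|x| + x*x / (1.0 + sqrt (x*x + 1.0))),
   which stays accurate for small |x|.  */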
15219
15220 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15221 {
15222 rtx e1 = gen_reg_rtx (XFmode);
15223 rtx e2 = gen_reg_rtx (XFmode);
15224 rtx scratch = gen_reg_rtx (HImode);
15225 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15226 rtx cst1, tmp;
15227 rtx_code_label *jump_label = gen_label_rtx ();
15228 rtx_insn *insn;
15229
15230 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15231 emit_insn (gen_mulxf3 (e1, op1, op1));
15232 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15233 emit_insn (gen_addxf3 (e2, e1, cst1));
15234 emit_insn (gen_sqrtxf2 (e2, e2));
15235 emit_insn (gen_addxf3 (e2, e2, cst1));
15236
15237 /* e1 = e1 / e2 */
15238 emit_insn (gen_divxf3 (e1, e1, e2));
15239
15240 /* scratch = fxam (op1) */
15241 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15242
15243 /* e1 = e1 + |op1| */
15244 emit_insn (gen_absxf2 (e2, op1));
15245 emit_insn (gen_addxf3 (e1, e1, e2));
15246
15247 /* e2 = log1p (e1) */
15248 ix86_emit_i387_log1p (e2, e1);
15249
15250 /* flags = signbit (op1) */
15251 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15252
15253 /* if (flags) then e2 = -e2 */
15254 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15255 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15256 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15257 pc_rtx);
15258 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15259 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15260 JUMP_LABEL (insn) = jump_label;
15261
15262 emit_insn (gen_negxf2 (e2, e2));
15263
15264 emit_label (jump_label);
15265 LABEL_NUSES (jump_label) = 1;
15266
15267 emit_move_insn (op0, e2);
15268 }
15269
15270 /* Output code to perform an acosh XFmode calculation. */
15271
15272 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15273 {
15274 rtx e1 = gen_reg_rtx (XFmode);
15275 rtx e2 = gen_reg_rtx (XFmode);
15276 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15277
15278 /* e2 = sqrt (op1 + 1.0) */
15279 emit_insn (gen_addxf3 (e2, op1, cst1));
15280 emit_insn (gen_sqrtxf2 (e2, e2));
15281
15282 /* e1 = sqrt (op1 - 1.0) */
15283 emit_insn (gen_subxf3 (e1, op1, cst1));
15284 emit_insn (gen_sqrtxf2 (e1, e1));
15285
15286 /* e1 = e1 * e2 */
15287 emit_insn (gen_mulxf3 (e1, e1, e2));
15288
15289 /* e1 = e1 + op1 */
15290 emit_insn (gen_addxf3 (e1, e1, op1));
15291
15292 /* op0 = log (e1) */
15293 emit_insn (gen_logxf2 (op0, e1));
15294 }
15295
15296 /* Output code to perform an atanh XFmode calculation. */
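/* The expansion below uses the identity
     atanh(x) = -sign(x) * 0.5 * log1p (-2 * |x| / (|x| + 1.0)).  */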
15297
15298 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15299 {
15300 rtx e1 = gen_reg_rtx (XFmode);
15301 rtx e2 = gen_reg_rtx (XFmode);
15302 rtx scratch = gen_reg_rtx (HImode);
15303 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15304 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15305 rtx cst1, tmp;
15306 rtx_code_label *jump_label = gen_label_rtx ();
15307 rtx_insn *insn;
15308
15309 /* scratch = fxam (op1) */
15310 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15311
15312 /* e2 = |op1| */
15313 emit_insn (gen_absxf2 (e2, op1));
15314
15315 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15316 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15317 emit_insn (gen_addxf3 (e1, e2, cst1));
15318 emit_insn (gen_addxf3 (e2, e2, e2));
15319 emit_insn (gen_negxf2 (e2, e2));
15320 emit_insn (gen_divxf3 (e1, e2, e1));
15321
15322 /* e2 = log1p (e1) */
15323 ix86_emit_i387_log1p (e2, e1);
15324
15325 /* flags = signbit (op1) */
15326 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15327
15328 /* if (!flags) then e2 = -e2 */
15329 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15330 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15331 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15332 pc_rtx);
15333 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15334 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15335 JUMP_LABEL (insn) = jump_label;
15336
15337 emit_insn (gen_negxf2 (e2, e2));
15338
15339 emit_label (jump_label);
15340 LABEL_NUSES (jump_label) = 1;
15341
15342 /* op0 = 0.5 * e2 */
15343 half = force_reg (XFmode, half);
15344 emit_insn (gen_mulxf3 (op0, e2, half));
15345 }
15346
15347 /* Output code to perform a log1p XFmode calculation. */
15348
15349 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15350 {
15351 rtx_code_label *label1 = gen_label_rtx ();
15352 rtx_code_label *label2 = gen_label_rtx ();
15353
15354 rtx tmp = gen_reg_rtx (XFmode);
15355 rtx res = gen_reg_rtx (XFmode);
15356 rtx cst, cstln2, cst1;
15357 rtx_insn *insn;
15358
15359 cst = const_double_from_real_value
15360 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
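  /* 0.29289... is 1 - sqrt(2)/2, the largest magnitude the fyl2xp1
     instruction accepts; for larger inputs fall back to fyl2x on 1.0 + op1.
     The fldln2 constant supplies the ln(2) factor that turns the base-2
     logarithm into a natural logarithm in both cases.  */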
15361 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15362
15363 emit_insn (gen_absxf2 (tmp, op1));
15364
15365 cst = force_reg (XFmode, cst);
15366 ix86_expand_branch (GE, tmp, cst, label1);
15367 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15368 insn = get_last_insn ();
15369 JUMP_LABEL (insn) = label1;
15370
15371 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15372 emit_jump (label2);
15373
15374 emit_label (label1);
15375 LABEL_NUSES (label1) = 1;
15376
15377 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15378 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15379 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15380
15381 emit_label (label2);
15382 LABEL_NUSES (label2) = 1;
15383
15384 emit_move_insn (op0, res);
15385 }
15386
15387 /* Emit code for round calculation. */
15388 void ix86_emit_i387_round (rtx op0, rtx op1)
15389 {
15390 machine_mode inmode = GET_MODE (op1);
15391 machine_mode outmode = GET_MODE (op0);
15392 rtx e1 = gen_reg_rtx (XFmode);
15393 rtx e2 = gen_reg_rtx (XFmode);
15394 rtx scratch = gen_reg_rtx (HImode);
15395 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15396 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15397 rtx res = gen_reg_rtx (outmode);
15398 rtx_code_label *jump_label = gen_label_rtx ();
15399 rtx (*floor_insn) (rtx, rtx);
15400 rtx (*neg_insn) (rtx, rtx);
15401 rtx_insn *insn;
15402 rtx tmp;
15403
15404 switch (inmode)
15405 {
15406 case E_SFmode:
15407 case E_DFmode:
15408 tmp = gen_reg_rtx (XFmode);
15409
15410 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15411 op1 = tmp;
15412 break;
15413 case E_XFmode:
15414 break;
15415 default:
15416 gcc_unreachable ();
15417 }
15418
15419 switch (outmode)
15420 {
15421 case E_SFmode:
15422 floor_insn = gen_frndintxf2_floor;
15423 neg_insn = gen_negsf2;
15424 break;
15425 case E_DFmode:
15426 floor_insn = gen_frndintxf2_floor;
15427 neg_insn = gen_negdf2;
15428 break;
15429 case E_XFmode:
15430 floor_insn = gen_frndintxf2_floor;
15431 neg_insn = gen_negxf2;
15432 break;
15433 case E_HImode:
15434 floor_insn = gen_lfloorxfhi2;
15435 neg_insn = gen_neghi2;
15436 break;
15437 case E_SImode:
15438 floor_insn = gen_lfloorxfsi2;
15439 neg_insn = gen_negsi2;
15440 break;
15441 case E_DImode:
15442 floor_insn = gen_lfloorxfdi2;
15443 neg_insn = gen_negdi2;
15444 break;
15445 default:
15446 gcc_unreachable ();
15447 }
15448
15449 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15450
15451 /* scratch = fxam(op1) */
15452 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15453
15454 /* e1 = fabs(op1) */
15455 emit_insn (gen_absxf2 (e1, op1));
15456
15457 /* e2 = e1 + 0.5 */
15458 half = force_reg (XFmode, half);
15459 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15460
15461 /* res = floor(e2) */
15462 switch (outmode)
15463 {
15464 case E_SFmode:
15465 case E_DFmode:
15466 {
15467 tmp = gen_reg_rtx (XFmode);
15468
15469 emit_insn (floor_insn (tmp, e2));
15470 emit_insn (gen_rtx_SET (res,
15471 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15472 UNSPEC_TRUNC_NOOP)));
15473 }
15474 break;
15475 default:
15476 emit_insn (floor_insn (res, e2));
15477 }
15478
15479 /* flags = signbit(a) */
15480 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15481
15482 /* if (flags) then res = -res */
15483 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15484 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15485 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15486 pc_rtx);
15487 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15488 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15489 JUMP_LABEL (insn) = jump_label;
15490
15491 emit_insn (neg_insn (res, res));
15492
15493 emit_label (jump_label);
15494 LABEL_NUSES (jump_label) = 1;
15495
15496 emit_move_insn (op0, res);
15497 }
15498
15499 /* Output code to perform a Newton-Raphson approximation of a single precision
15500 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15501
15502 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15503 {
15504 rtx x0, x1, e0, e1;
15505
15506 x0 = gen_reg_rtx (mode);
15507 e0 = gen_reg_rtx (mode);
15508 e1 = gen_reg_rtx (mode);
15509 x1 = gen_reg_rtx (mode);
15510
15511 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
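  /* The parenthesized factor is one Newton-Raphson step
     x1 = x0 * (2 - b * x0) applied to the rcp estimate x0, which roughly
     doubles the number of correct bits in the approximation.  */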
15512
15513 b = force_reg (mode, b);
15514
15515 /* x0 = rcp(b) estimate */
15516 if (mode == V16SFmode || mode == V8DFmode)
15517 {
15518 if (TARGET_AVX512ER)
15519 {
15520 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15521 UNSPEC_RCP28)));
15522 /* res = a * x0 */
15523 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15524 return;
15525 }
15526 else
15527 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15528 UNSPEC_RCP14)));
15529 }
15530 else
15531 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15532 UNSPEC_RCP)));
15533
15534 /* e0 = x0 * b */
15535 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15536
15537 /* e0 = x0 * e0 */
15538 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15539
15540 /* e1 = x0 + x0 */
15541 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15542
15543 /* x1 = e1 - e0 */
15544 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15545
15546 /* res = a * x1 */
15547 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15548 }
15549
15550 /* Output code to perform a Newton-Raphson approximation of a
15551 single precision floating point [reciprocal] square root. */
15552
15553 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15554 {
15555 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15556 REAL_VALUE_TYPE r;
15557 int unspec;
15558
15559 x0 = gen_reg_rtx (mode);
15560 e0 = gen_reg_rtx (mode);
15561 e1 = gen_reg_rtx (mode);
15562 e2 = gen_reg_rtx (mode);
15563 e3 = gen_reg_rtx (mode);
15564
15565 if (TARGET_AVX512ER && mode == V16SFmode)
15566 {
15567 if (recip)
15568 /* res = rsqrt28(a) estimate */
15569 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15570 UNSPEC_RSQRT28)));
15571 else
15572 {
15573 /* x0 = rsqrt28(a) estimate */
15574 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15575 UNSPEC_RSQRT28)));
15576 /* res = rcp28(x0) estimate */
15577 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15578 UNSPEC_RCP28)));
15579 }
15580 return;
15581 }
15582
15583 real_from_integer (&r, VOIDmode, -3, SIGNED);
15584 mthree = const_double_from_real_value (r, SFmode);
15585
15586 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15587 mhalf = const_double_from_real_value (r, SFmode);
15588 unspec = UNSPEC_RSQRT;
15589
15590 if (VECTOR_MODE_P (mode))
15591 {
15592 mthree = ix86_build_const_vector (mode, true, mthree);
15593 mhalf = ix86_build_const_vector (mode, true, mhalf);
15594 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15595 if (GET_MODE_SIZE (mode) == 64)
15596 unspec = UNSPEC_RSQRT14;
15597 }
15598
15599 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15600 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
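  /* Both formulas are one Newton-Raphson step
     x1 = 0.5 * x0 * (3 - a * x0 * x0) applied to the rsqrt estimate x0,
     written with negated constants (-3, -0.5) so the step maps directly
     onto the multiply sequence below.  */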
15601
15602 a = force_reg (mode, a);
15603
15604 /* x0 = rsqrt(a) estimate */
15605 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15606 unspec)));
15607
15608   /* If a == 0.0, mask out the infinite rsqrt estimate to prevent NaN for sqrt (0.0).  */
15609 if (!recip)
15610 {
15611 rtx zero = force_reg (mode, CONST0_RTX(mode));
15612 rtx mask;
15613
15614 /* Handle masked compare. */
15615 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15616 {
15617 mask = gen_reg_rtx (HImode);
15618 /* Imm value 0x4 corresponds to not-equal comparison. */
15619 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15620 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15621 }
15622 else
15623 {
15624 mask = gen_reg_rtx (mode);
15625 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15626 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15627 }
15628 }
15629
15630 /* e0 = x0 * a */
15631 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15632 /* e1 = e0 * x0 */
15633 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15634
15635 /* e2 = e1 - 3. */
15636 mthree = force_reg (mode, mthree);
15637 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15638
15639 mhalf = force_reg (mode, mhalf);
15640 if (recip)
15641 /* e3 = -.5 * x0 */
15642 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15643 else
15644 /* e3 = -.5 * e0 */
15645 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15646 /* ret = e2 * e3 */
15647 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15648 }
15649
15650 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15651 mask for masking out the sign-bit is stored in *SMASK, if that is
15652 non-null. */
15653
15654 static rtx
15655 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15656 {
15657 machine_mode vmode, mode = GET_MODE (op0);
15658 rtx xa, mask;
15659
15660 xa = gen_reg_rtx (mode);
15661 if (mode == SFmode)
15662 vmode = V4SFmode;
15663 else if (mode == DFmode)
15664 vmode = V2DFmode;
15665 else
15666 vmode = mode;
15667 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15668 if (!VECTOR_MODE_P (mode))
15669 {
15670 /* We need to generate a scalar mode mask in this case. */
15671 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15672 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15673 mask = gen_reg_rtx (mode);
15674 emit_insn (gen_rtx_SET (mask, tmp));
15675 }
15676 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15677
15678 if (smask)
15679 *smask = mask;
15680
15681 return xa;
15682 }
15683
15684 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15685 swapping the operands if SWAP_OPERANDS is true. The expanded
15686 code is a forward jump to a newly created label in case the
15687 comparison is true. The generated label rtx is returned. */
15688 static rtx_code_label *
15689 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15690 bool swap_operands)
15691 {
15692 bool unordered_compare = ix86_unordered_fp_compare (code);
15693 rtx_code_label *label;
15694 rtx tmp, reg;
15695
15696 if (swap_operands)
15697 std::swap (op0, op1);
15698
15699 label = gen_label_rtx ();
15700 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15701 if (unordered_compare)
15702 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15703 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15704 emit_insn (gen_rtx_SET (reg, tmp));
15705 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15706 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15707 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15708 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15709 JUMP_LABEL (tmp) = label;
15710
15711 return label;
15712 }
15713
15714 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15715 using comparison code CODE. Operands are swapped for the comparison if
15716 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15717 static rtx
15718 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15719 bool swap_operands)
15720 {
15721 rtx (*insn)(rtx, rtx, rtx, rtx);
15722 machine_mode mode = GET_MODE (op0);
15723 rtx mask = gen_reg_rtx (mode);
15724
15725 if (swap_operands)
15726 std::swap (op0, op1);
15727
15728 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15729
15730 emit_insn (insn (mask, op0, op1,
15731 gen_rtx_fmt_ee (code, mode, op0, op1)));
15732 return mask;
15733 }
15734
15735 /* Expand copysign from SIGN to the positive value ABS_VALUE
15736 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15737 the sign-bit. */
15738
15739 static void
15740 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15741 {
15742 machine_mode mode = GET_MODE (sign);
15743 rtx sgn = gen_reg_rtx (mode);
15744 if (mask == NULL_RTX)
15745 {
15746 machine_mode vmode;
15747
15748 if (mode == SFmode)
15749 vmode = V4SFmode;
15750 else if (mode == DFmode)
15751 vmode = V2DFmode;
15752 else
15753 vmode = mode;
15754
15755 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15756 if (!VECTOR_MODE_P (mode))
15757 {
15758 /* We need to generate a scalar mode mask in this case. */
15759 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15760 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15761 mask = gen_reg_rtx (mode);
15762 emit_insn (gen_rtx_SET (mask, tmp));
15763 }
15764 }
15765 else
15766 mask = gen_rtx_NOT (mode, mask);
15767 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15768 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15769 }
15770
15771 /* Expand SSE sequence for computing lround from OP1 storing
15772 into OP0. */
15773
15774 void
15775 ix86_expand_lround (rtx op0, rtx op1)
15776 {
15777 /* C code for the stuff we're doing below:
15778 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15779 return (long)tmp;
15780 */
15781 machine_mode mode = GET_MODE (op1);
15782 const struct real_format *fmt;
15783 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15784 rtx adj;
15785
15786 /* load nextafter (0.5, 0.0) */
15787 fmt = REAL_MODE_FORMAT (mode);
15788 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15789 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
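  /* Using the largest representable value below 0.5 instead of 0.5 itself
     keeps op1 + adj from rounding up to the next integer when op1 is just
     below a half-way point, e.g. for the double just under 0.5.  */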
15790
15791 /* adj = copysign (0.5, op1) */
15792 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15793 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15794
15795 /* adj = op1 + adj */
15796 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15797
15798 /* op0 = (imode)adj */
15799 expand_fix (op0, adj, 0);
15800 }
15801
15802 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
15803    into OP0.  */
15804
15805 void
15806 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15807 {
15808 /* C code for the stuff we're doing below (for do_floor):
15809 xi = (long)op1;
15810 xi -= (double)xi > op1 ? 1 : 0;
15811 return xi;
15812 */
15813 machine_mode fmode = GET_MODE (op1);
15814 machine_mode imode = GET_MODE (op0);
15815 rtx ireg, freg, tmp;
15816 rtx_code_label *label;
15817
15818 /* reg = (long)op1 */
15819 ireg = gen_reg_rtx (imode);
15820 expand_fix (ireg, op1, 0);
15821
15822 /* freg = (double)reg */
15823 freg = gen_reg_rtx (fmode);
15824 expand_float (freg, ireg, 0);
15825
15826 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15827 label = ix86_expand_sse_compare_and_jump (UNLE,
15828 freg, op1, !do_floor);
15829 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15830 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15831 emit_move_insn (ireg, tmp);
15832
15833 emit_label (label);
15834 LABEL_NUSES (label) = 1;
15835
15836 emit_move_insn (op0, ireg);
15837 }
15838
15839 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15840 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
15841
15842 static rtx
15843 ix86_gen_TWO52 (machine_mode mode)
15844 {
15845 REAL_VALUE_TYPE TWO52r;
15846 rtx TWO52;
15847
15848 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15849 TWO52 = const_double_from_real_value (TWO52r, mode);
15850 TWO52 = force_reg (mode, TWO52);
15851
15852 return TWO52;
15853 }
15854
15855 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
15856
15857 void
15858 ix86_expand_rint (rtx operand0, rtx operand1)
15859 {
15860 /* C code for the stuff we're doing below:
15861 xa = fabs (operand1);
15862 if (!isless (xa, 2**52))
15863 return operand1;
15864 two52 = 2**52;
15865 if (flag_rounding_math)
15866 {
15867 two52 = copysign (two52, operand1);
15868 xa = operand1;
15869 }
15870 xa = xa + two52 - two52;
15871 return copysign (xa, operand1);
15872 */
15873 machine_mode mode = GET_MODE (operand0);
15874 rtx res, xa, TWO52, two52, mask;
15875 rtx_code_label *label;
15876
15877 res = gen_reg_rtx (mode);
15878 emit_move_insn (res, operand1);
15879
15880 /* xa = abs (operand1) */
15881 xa = ix86_expand_sse_fabs (res, &mask);
15882
15883 /* if (!isless (xa, TWO52)) goto label; */
15884 TWO52 = ix86_gen_TWO52 (mode);
15885 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15886
15887 two52 = TWO52;
15888 if (flag_rounding_math)
15889 {
15890 two52 = gen_reg_rtx (mode);
15891 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
15892 xa = res;
15893 }
15894
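  /* Adding and then subtracting 2**52 (2**23 for SFmode) forces the
     fractional bits to be rounded away by the addition, leaving the
     nearest integer under the current rounding mode.  */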
15895 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
15896 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
15897
15898 ix86_sse_copysign_to_positive (res, xa, res, mask);
15899
15900 emit_label (label);
15901 LABEL_NUSES (label) = 1;
15902
15903 emit_move_insn (operand0, res);
15904 }
15905
15906 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15907    into OPERAND0, without converting through an intermediate integer mode.  */
15908 void
15909 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15910 {
15911 /* C code for the stuff we expand below.
15912 double xa = fabs (x), x2;
15913 if (!isless (xa, TWO52))
15914 return x;
15915 xa = xa + TWO52 - TWO52;
15916 x2 = copysign (xa, x);
15917 Compensate. Floor:
15918 if (x2 > x)
15919 x2 -= 1;
15920 Compensate. Ceil:
15921 if (x2 < x)
15922 x2 += 1;
15923 if (HONOR_SIGNED_ZEROS (mode))
15924 x2 = copysign (x2, x);
15925 return x2;
15926 */
15927 machine_mode mode = GET_MODE (operand0);
15928 rtx xa, TWO52, tmp, one, res, mask;
15929 rtx_code_label *label;
15930
15931 TWO52 = ix86_gen_TWO52 (mode);
15932
15933 /* Temporary for holding the result, initialized to the input
15934 operand to ease control flow. */
15935 res = gen_reg_rtx (mode);
15936 emit_move_insn (res, operand1);
15937
15938 /* xa = abs (operand1) */
15939 xa = ix86_expand_sse_fabs (res, &mask);
15940
15941 /* if (!isless (xa, TWO52)) goto label; */
15942 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15943
15944 /* xa = xa + TWO52 - TWO52; */
15945 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15946 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15947
15948 /* xa = copysign (xa, operand1) */
15949 ix86_sse_copysign_to_positive (xa, xa, res, mask);
15950
15951 /* generate 1.0 */
15952 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15953
15954 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15955 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15956 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15957 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15958 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15959 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
15960 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15961 emit_move_insn (res, tmp);
15962
15963 emit_label (label);
15964 LABEL_NUSES (label) = 1;
15965
15966 emit_move_insn (operand0, res);
15967 }
15968
15969 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15970 into OPERAND0. */
15971 void
15972 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15973 {
15974 /* C code for the stuff we expand below.
15975 double xa = fabs (x), x2;
15976 if (!isless (xa, TWO52))
15977 return x;
15978 x2 = (double)(long)x;
15979 Compensate. Floor:
15980 if (x2 > x)
15981 x2 -= 1;
15982 Compensate. Ceil:
15983 if (x2 < x)
15984 x2 += 1;
15985 if (HONOR_SIGNED_ZEROS (mode))
15986 return copysign (x2, x);
15987 return x2;
15988 */
15989 machine_mode mode = GET_MODE (operand0);
15990 rtx xa, xi, TWO52, tmp, one, res, mask;
15991 rtx_code_label *label;
15992
15993 TWO52 = ix86_gen_TWO52 (mode);
15994
15995 /* Temporary for holding the result, initialized to the input
15996 operand to ease control flow. */
15997 res = gen_reg_rtx (mode);
15998 emit_move_insn (res, operand1);
15999
16000 /* xa = abs (operand1) */
16001 xa = ix86_expand_sse_fabs (res, &mask);
16002
16003 /* if (!isless (xa, TWO52)) goto label; */
16004 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16005
16006 /* xa = (double)(long)x */
16007 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16008 expand_fix (xi, res, 0);
16009 expand_float (xa, xi, 0);
16010
16011 /* generate 1.0 */
16012 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16013
16014 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16015 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16016 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16017 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16018 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16019 emit_move_insn (res, tmp);
16020
16021 if (HONOR_SIGNED_ZEROS (mode))
16022 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16023
16024 emit_label (label);
16025 LABEL_NUSES (label) = 1;
16026
16027 emit_move_insn (operand0, res);
16028 }
16029
16030 /* Expand SSE sequence for computing round from OPERAND1 storing
16031    into OPERAND0.  This sequence works without relying on DImode
16032    truncation via cvttsd2siq, which is only available on 64bit targets.  */
16033 void
16034 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16035 {
16036 /* C code for the stuff we expand below.
16037 double xa = fabs (x), xa2, x2;
16038 if (!isless (xa, TWO52))
16039 return x;
16040 Using the absolute value and copying back sign makes
16041 -0.0 -> -0.0 correct.
16042 xa2 = xa + TWO52 - TWO52;
16043 Compensate.
16044 dxa = xa2 - xa;
16045 if (dxa <= -0.5)
16046 xa2 += 1;
16047 else if (dxa > 0.5)
16048 xa2 -= 1;
16049 x2 = copysign (xa2, x);
16050 return x2;
16051 */
16052 machine_mode mode = GET_MODE (operand0);
16053 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16054 rtx_code_label *label;
16055
16056 TWO52 = ix86_gen_TWO52 (mode);
16057
16058 /* Temporary for holding the result, initialized to the input
16059 operand to ease control flow. */
16060 res = gen_reg_rtx (mode);
16061 emit_move_insn (res, operand1);
16062
16063 /* xa = abs (operand1) */
16064 xa = ix86_expand_sse_fabs (res, &mask);
16065
16066 /* if (!isless (xa, TWO52)) goto label; */
16067 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16068
16069 /* xa2 = xa + TWO52 - TWO52; */
16070 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16071 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16072
16073 /* dxa = xa2 - xa; */
16074 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16075
16076 /* generate 0.5, 1.0 and -0.5 */
16077 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16078 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16079 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16080 0, OPTAB_DIRECT);
16081
16082 /* Compensate. */
16083 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16084 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16085 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16086 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16087 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16088 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16089 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16090 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16091
16092 /* res = copysign (xa2, operand1) */
16093 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16094
16095 emit_label (label);
16096 LABEL_NUSES (label) = 1;
16097
16098 emit_move_insn (operand0, res);
16099 }
16100
16101 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16102 into OPERAND0. */
16103 void
16104 ix86_expand_trunc (rtx operand0, rtx operand1)
16105 {
16106 /* C code for SSE variant we expand below.
16107 double xa = fabs (x), x2;
16108 if (!isless (xa, TWO52))
16109 return x;
16110 x2 = (double)(long)x;
16111 if (HONOR_SIGNED_ZEROS (mode))
16112 return copysign (x2, x);
16113 return x2;
16114 */
16115 machine_mode mode = GET_MODE (operand0);
16116 rtx xa, xi, TWO52, res, mask;
16117 rtx_code_label *label;
16118
16119 TWO52 = ix86_gen_TWO52 (mode);
16120
16121 /* Temporary for holding the result, initialized to the input
16122 operand to ease control flow. */
16123 res = gen_reg_rtx (mode);
16124 emit_move_insn (res, operand1);
16125
16126 /* xa = abs (operand1) */
16127 xa = ix86_expand_sse_fabs (res, &mask);
16128
16129 /* if (!isless (xa, TWO52)) goto label; */
16130 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16131
16132 /* x = (double)(long)x */
16133 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16134 expand_fix (xi, res, 0);
16135 expand_float (res, xi, 0);
16136
16137 if (HONOR_SIGNED_ZEROS (mode))
16138 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16139
16140 emit_label (label);
16141 LABEL_NUSES (label) = 1;
16142
16143 emit_move_insn (operand0, res);
16144 }
16145
16146 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
16147    OPERAND0, without converting through an intermediate integer mode.  */
16148 void
16149 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16150 {
16151 machine_mode mode = GET_MODE (operand0);
16152 rtx xa, mask, TWO52, one, res, smask, tmp;
16153 rtx_code_label *label;
16154
16155 /* C code for SSE variant we expand below.
16156 double xa = fabs (x), x2;
16157 if (!isless (xa, TWO52))
16158 return x;
16159 xa2 = xa + TWO52 - TWO52;
16160 Compensate:
16161 if (xa2 > xa)
16162 xa2 -= 1.0;
16163 x2 = copysign (xa2, x);
16164 return x2;
16165 */
16166
16167 TWO52 = ix86_gen_TWO52 (mode);
16168
16169 /* Temporary for holding the result, initialized to the input
16170 operand to ease control flow. */
16171 res = gen_reg_rtx (mode);
16172 emit_move_insn (res, operand1);
16173
16174 /* xa = abs (operand1) */
16175 xa = ix86_expand_sse_fabs (res, &smask);
16176
16177 /* if (!isless (xa, TWO52)) goto label; */
16178 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16179
16180 /* res = xa + TWO52 - TWO52; */
16181 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16182 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
16183 emit_move_insn (res, tmp);
16184
16185 /* generate 1.0 */
16186 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16187
16188 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
16189 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
16190 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
16191 tmp = expand_simple_binop (mode, MINUS,
16192 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
16193 emit_move_insn (res, tmp);
16194
16195 /* res = copysign (res, operand1) */
16196 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
16197
16198 emit_label (label);
16199 LABEL_NUSES (label) = 1;
16200
16201 emit_move_insn (operand0, res);
16202 }
16203
16204 /* Expand SSE sequence for computing round from OPERAND1 storing
16205 into OPERAND0. */
16206 void
16207 ix86_expand_round (rtx operand0, rtx operand1)
16208 {
16209 /* C code for the stuff we're doing below:
16210 double xa = fabs (x);
16211 if (!isless (xa, TWO52))
16212 return x;
16213 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16214 return copysign (xa, x);
16215 */
16216 machine_mode mode = GET_MODE (operand0);
16217 rtx res, TWO52, xa, xi, half, mask;
16218 rtx_code_label *label;
16219 const struct real_format *fmt;
16220 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16221
16222 /* Temporary for holding the result, initialized to the input
16223 operand to ease control flow. */
16224 res = gen_reg_rtx (mode);
16225 emit_move_insn (res, operand1);
16226
16227 TWO52 = ix86_gen_TWO52 (mode);
16228 xa = ix86_expand_sse_fabs (res, &mask);
16229 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16230
16231 /* load nextafter (0.5, 0.0) */
16232 fmt = REAL_MODE_FORMAT (mode);
16233 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16234 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16235
16236 /* xa = xa + 0.5 */
16237 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16238 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16239
16240 /* xa = (double)(int64_t)xa */
16241 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16242 expand_fix (xi, xa, 0);
16243 expand_float (xa, xi, 0);
16244
16245 /* res = copysign (xa, operand1) */
16246 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16247
16248 emit_label (label);
16249 LABEL_NUSES (label) = 1;
16250
16251 emit_move_insn (operand0, res);
16252 }
16253
16254 /* Expand SSE sequence for computing round
16255 from OP1 storing into OP0 using sse4 round insn. */
16256 void
16257 ix86_expand_round_sse4 (rtx op0, rtx op1)
16258 {
16259 machine_mode mode = GET_MODE (op0);
16260 rtx e1, e2, res, half;
16261 const struct real_format *fmt;
16262 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16263 rtx (*gen_copysign) (rtx, rtx, rtx);
16264 rtx (*gen_round) (rtx, rtx, rtx);
16265
16266 switch (mode)
16267 {
16268 case E_SFmode:
16269 gen_copysign = gen_copysignsf3;
16270 gen_round = gen_sse4_1_roundsf2;
16271 break;
16272 case E_DFmode:
16273 gen_copysign = gen_copysigndf3;
16274 gen_round = gen_sse4_1_rounddf2;
16275 break;
16276 default:
16277 gcc_unreachable ();
16278 }
16279
16280 /* round (a) = trunc (a + copysign (0.5, a)) */
16281
16282 /* load nextafter (0.5, 0.0) */
16283 fmt = REAL_MODE_FORMAT (mode);
16284 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16285 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16286 half = const_double_from_real_value (pred_half, mode);
16287
16288 /* e1 = copysign (0.5, op1) */
16289 e1 = gen_reg_rtx (mode);
16290 emit_insn (gen_copysign (e1, half, op1));
16291
16292 /* e2 = op1 + e1 */
16293 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16294
16295 /* res = trunc (e2) */
16296 res = gen_reg_rtx (mode);
16297 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16298
16299 emit_move_insn (op0, res);
16300 }
16301
16302 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16303 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16304 insn every time. */
16305
16306 static GTY(()) rtx_insn *vselect_insn;
16307
16308 /* Initialize vselect_insn. */
16309
16310 static void
16311 init_vselect_insn (void)
16312 {
16313 unsigned i;
16314 rtx x;
16315
16316 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16317 for (i = 0; i < MAX_VECT_LEN; ++i)
16318 XVECEXP (x, 0, i) = const0_rtx;
16319 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16320 const0_rtx), x);
16321 x = gen_rtx_SET (const0_rtx, x);
16322 start_sequence ();
16323 vselect_insn = emit_insn (x);
16324 end_sequence ();
16325 }
16326
16327 /* Construct (set target (vec_select op0 (parallel perm))) and
16328 return true if that's a valid instruction in the active ISA. */
16329
16330 static bool
16331 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16332 unsigned nelt, bool testing_p)
16333 {
16334 unsigned int i;
16335 rtx x, save_vconcat;
16336 int icode;
16337
16338 if (vselect_insn == NULL_RTX)
16339 init_vselect_insn ();
16340
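  /* Temporarily plug the requested operands into the cached insn, ask
     recog_memoized whether the resulting pattern is valid in the active ISA,
     then restore the placeholder operands so the insn can be reused.  */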
16341 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16342 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16343 for (i = 0; i < nelt; ++i)
16344 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16345 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16346 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16347 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16348 SET_DEST (PATTERN (vselect_insn)) = target;
16349 icode = recog_memoized (vselect_insn);
16350
16351 if (icode >= 0 && !testing_p)
16352 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16353
16354 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16355 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16356 INSN_CODE (vselect_insn) = -1;
16357
16358 return icode >= 0;
16359 }
16360
16361 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16362
16363 static bool
16364 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16365 const unsigned char *perm, unsigned nelt,
16366 bool testing_p)
16367 {
16368 machine_mode v2mode;
16369 rtx x;
16370 bool ok;
16371
16372 if (vselect_insn == NULL_RTX)
16373 init_vselect_insn ();
16374
16375 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16376 return false;
16377 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16378 PUT_MODE (x, v2mode);
16379 XEXP (x, 0) = op0;
16380 XEXP (x, 1) = op1;
16381 ok = expand_vselect (target, x, perm, nelt, testing_p);
16382 XEXP (x, 0) = const0_rtx;
16383 XEXP (x, 1) = const0_rtx;
16384 return ok;
16385 }
16386
16387 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16388 using movss or movsd. */
16389 static bool
16390 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16391 {
16392 machine_mode vmode = d->vmode;
16393 unsigned i, nelt = d->nelt;
16394 rtx x;
16395
16396 if (d->one_operand_p)
16397 return false;
16398
16399 if (!(TARGET_SSE && vmode == V4SFmode)
16400 && !(TARGET_SSE2 && vmode == V2DFmode))
16401 return false;
16402
16403 /* Only the first element is changed. */
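  /* movss/movsd merge element 0 of one operand with elements 1..nelt-1 of
     the other, so either element 0 comes from op1 and the rest from op0,
     or element 0 comes from op0 and the rest from op1.  */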
16404 if (d->perm[0] != nelt && d->perm[0] != 0)
16405 return false;
16406 for (i = 1; i < nelt; ++i)
16407 if (d->perm[i] != i + nelt - d->perm[0])
16408 return false;
16409
16410 if (d->testing_p)
16411 return true;
16412
16413 if (d->perm[0] == nelt)
16414 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16415 else
16416 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16417
16418 emit_insn (gen_rtx_SET (d->target, x));
16419
16420 return true;
16421 }
16422
16423 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16424 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16425
16426 static bool
16427 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16428 {
16429 machine_mode mmode, vmode = d->vmode;
16430 unsigned i, nelt = d->nelt;
16431 unsigned HOST_WIDE_INT mask;
16432 rtx target, op0, op1, maskop, x;
16433 rtx rperm[32], vperm;
16434
16435 if (d->one_operand_p)
16436 return false;
16437 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16438 && (TARGET_AVX512BW
16439 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16440 ;
16441 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16442 ;
16443 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16444 ;
16445 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16446 ;
16447 else
16448 return false;
16449
16450 /* This is a blend, not a permute. Elements must stay in their
16451 respective lanes. */
16452 for (i = 0; i < nelt; ++i)
16453 {
16454 unsigned e = d->perm[i];
16455 if (!(e == i || e == i + nelt))
16456 return false;
16457 }
16458
16459 if (d->testing_p)
16460 return true;
16461
16462 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16463 decision should be extracted elsewhere, so that we only try that
16464 sequence once all budget==3 options have been tried. */
16465 target = d->target;
16466 op0 = d->op0;
16467 op1 = d->op1;
16468 mask = 0;
16469
16470 switch (vmode)
16471 {
16472 case E_V8DFmode:
16473 case E_V16SFmode:
16474 case E_V4DFmode:
16475 case E_V8SFmode:
16476 case E_V2DFmode:
16477 case E_V4SFmode:
16478 case E_V8HImode:
16479 case E_V8SImode:
16480 case E_V32HImode:
16481 case E_V64QImode:
16482 case E_V16SImode:
16483 case E_V8DImode:
16484 for (i = 0; i < nelt; ++i)
16485 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16486 break;
16487
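    /* Express DImode and SImode element blends as a word blend (pblendw),
       replicating each element's selector bit across the 4 resp. 2 words
       it occupies; there is no 128-bit integer blend at this granularity.  */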
16488 case E_V2DImode:
16489 for (i = 0; i < 2; ++i)
16490 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16491 vmode = V8HImode;
16492 goto do_subreg;
16493
16494 case E_V4SImode:
16495 for (i = 0; i < 4; ++i)
16496 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16497 vmode = V8HImode;
16498 goto do_subreg;
16499
16500 case E_V16QImode:
16501 /* See if bytes move in pairs so we can use pblendw with
16502 an immediate argument, rather than pblendvb with a vector
16503 argument. */
16504 for (i = 0; i < 16; i += 2)
16505 if (d->perm[i] + 1 != d->perm[i + 1])
16506 {
16507 use_pblendvb:
16508 for (i = 0; i < nelt; ++i)
16509 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16510
16511 finish_pblendvb:
16512 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16513 vperm = force_reg (vmode, vperm);
16514
16515 if (GET_MODE_SIZE (vmode) == 16)
16516 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16517 else
16518 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16519 if (target != d->target)
16520 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16521 return true;
16522 }
16523
16524 for (i = 0; i < 8; ++i)
16525 mask |= (d->perm[i * 2] >= 16) << i;
16526 vmode = V8HImode;
16527 /* FALLTHRU */
16528
16529 do_subreg:
16530 target = gen_reg_rtx (vmode);
16531 op0 = gen_lowpart (vmode, op0);
16532 op1 = gen_lowpart (vmode, op1);
16533 break;
16534
16535 case E_V32QImode:
16536 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16537 for (i = 0; i < 32; i += 2)
16538 if (d->perm[i] + 1 != d->perm[i + 1])
16539 goto use_pblendvb;
16540 /* See if bytes move in quadruplets. If yes, vpblendd
16541 with immediate can be used. */
16542 for (i = 0; i < 32; i += 4)
16543 if (d->perm[i] + 2 != d->perm[i + 2])
16544 break;
16545 if (i < 32)
16546 {
16547 /* See if bytes move the same in both lanes. If yes,
16548 vpblendw with immediate can be used. */
16549 for (i = 0; i < 16; i += 2)
16550 if (d->perm[i] + 16 != d->perm[i + 16])
16551 goto use_pblendvb;
16552
16553 /* Use vpblendw. */
16554 for (i = 0; i < 16; ++i)
16555 mask |= (d->perm[i * 2] >= 32) << i;
16556 vmode = V16HImode;
16557 goto do_subreg;
16558 }
16559
16560 /* Use vpblendd. */
16561 for (i = 0; i < 8; ++i)
16562 mask |= (d->perm[i * 4] >= 32) << i;
16563 vmode = V8SImode;
16564 goto do_subreg;
16565
16566 case E_V16HImode:
16567 /* See if words move in pairs. If yes, vpblendd can be used. */
16568 for (i = 0; i < 16; i += 2)
16569 if (d->perm[i] + 1 != d->perm[i + 1])
16570 break;
16571 if (i < 16)
16572 {
16573 /* See if words move the same in both lanes. If not,
16574 vpblendvb must be used. */
16575 for (i = 0; i < 8; i++)
16576 if (d->perm[i] + 8 != d->perm[i + 8])
16577 {
16578 /* Use vpblendvb. */
16579 for (i = 0; i < 32; ++i)
16580 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16581
16582 vmode = V32QImode;
16583 nelt = 32;
16584 target = gen_reg_rtx (vmode);
16585 op0 = gen_lowpart (vmode, op0);
16586 op1 = gen_lowpart (vmode, op1);
16587 goto finish_pblendvb;
16588 }
16589
16590 /* Use vpblendw. */
16591 for (i = 0; i < 16; ++i)
16592 mask |= (d->perm[i] >= 16) << i;
16593 break;
16594 }
16595
16596 /* Use vpblendd. */
16597 for (i = 0; i < 8; ++i)
16598 mask |= (d->perm[i * 2] >= 16) << i;
16599 vmode = V8SImode;
16600 goto do_subreg;
16601
16602 case E_V4DImode:
16603 /* Use vpblendd. */
16604 for (i = 0; i < 4; ++i)
16605 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16606 vmode = V8SImode;
16607 goto do_subreg;
16608
16609 default:
16610 gcc_unreachable ();
16611 }
16612
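  /* The 512-bit blends take the selector in a mask register whose width
     matches the element count; the 128-bit and 256-bit forms take it as
     an immediate.  */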
16613 switch (vmode)
16614 {
16615 case E_V8DFmode:
16616 case E_V8DImode:
16617 mmode = QImode;
16618 break;
16619 case E_V16SFmode:
16620 case E_V16SImode:
16621 mmode = HImode;
16622 break;
16623 case E_V32HImode:
16624 mmode = SImode;
16625 break;
16626 case E_V64QImode:
16627 mmode = DImode;
16628 break;
16629 default:
16630 mmode = VOIDmode;
16631 }
16632
16633 if (mmode != VOIDmode)
16634 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16635 else
16636 maskop = GEN_INT (mask);
16637
16638 /* This matches five different patterns with the different modes. */
16639 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16640 x = gen_rtx_SET (target, x);
16641 emit_insn (x);
16642 if (target != d->target)
16643 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16644
16645 return true;
16646 }
16647
16648 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16649 in terms of the variable form of vpermilps.
16650
16651 Note that we will have already failed the immediate input vpermilps,
16652 which requires that the high and low part shuffle be identical; the
16653 variable form doesn't require that. */
16654
16655 static bool
16656 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16657 {
16658 rtx rperm[8], vperm;
16659 unsigned i;
16660
16661 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16662 return false;
16663
16664 /* We can only permute within the 128-bit lane. */
16665 for (i = 0; i < 8; ++i)
16666 {
16667 unsigned e = d->perm[i];
16668 if (i < 4 ? e >= 4 : e < 4)
16669 return false;
16670 }
16671
16672 if (d->testing_p)
16673 return true;
16674
16675 for (i = 0; i < 8; ++i)
16676 {
16677 unsigned e = d->perm[i];
16678
16679 /* Within each 128-bit lane, the elements of op0 are numbered
16680 from 0 and the elements of op1 are numbered from 4. */
16681 if (e >= 8 + 4)
16682 e -= 8;
16683 else if (e >= 4)
16684 e -= 4;
16685
16686 rperm[i] = GEN_INT (e);
16687 }
16688
16689 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16690 vperm = force_reg (V8SImode, vperm);
16691 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16692
16693 return true;
16694 }
16695
16696 /* Return true if permutation D can be performed as VMODE permutation
16697 instead. */
16698
16699 static bool
16700 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16701 {
16702 unsigned int i, j, chunk;
16703
16704 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16705 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16706 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16707 return false;
16708
16709 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16710 return true;
16711
16712 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16713 for (i = 0; i < d->nelt; i += chunk)
16714 if (d->perm[i] & (chunk - 1))
16715 return false;
16716 else
16717 for (j = 1; j < chunk; ++j)
16718 if (d->perm[i] + j != d->perm[i + j])
16719 return false;
16720
16721 return true;
16722 }
16723
16724 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16725 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16726
16727 static bool
16728 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16729 {
16730 unsigned i, nelt, eltsz, mask;
16731 unsigned char perm[64];
16732 machine_mode vmode = V16QImode;
16733 rtx rperm[64], vperm, target, op0, op1;
16734
16735 nelt = d->nelt;
16736
16737 if (!d->one_operand_p)
16738 {
16739 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16740 {
16741 if (TARGET_AVX2
16742 && valid_perm_using_mode_p (V2TImode, d))
16743 {
16744 if (d->testing_p)
16745 return true;
16746
16747 /* Use vperm2i128 insn. The pattern uses
16748 V4DImode instead of V2TImode. */
16749 target = d->target;
16750 if (d->vmode != V4DImode)
16751 target = gen_reg_rtx (V4DImode);
16752 op0 = gen_lowpart (V4DImode, d->op0);
16753 op1 = gen_lowpart (V4DImode, d->op1);
16754 rperm[0]
16755 = GEN_INT ((d->perm[0] / (nelt / 2))
16756 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16757 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16758 if (target != d->target)
16759 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16760 return true;
16761 }
16762 return false;
16763 }
16764 }
16765 else
16766 {
16767 if (GET_MODE_SIZE (d->vmode) == 16)
16768 {
16769 if (!TARGET_SSSE3)
16770 return false;
16771 }
16772 else if (GET_MODE_SIZE (d->vmode) == 32)
16773 {
16774 if (!TARGET_AVX2)
16775 return false;
16776
16777 /* V4DImode should already be handled through
16778    expand_vselect by the vpermq instruction. */
16779 gcc_assert (d->vmode != V4DImode);
16780
16781 vmode = V32QImode;
16782 if (d->vmode == V8SImode
16783 || d->vmode == V16HImode
16784 || d->vmode == V32QImode)
16785 {
16786 /* First see if vpermq can be used for
16787 V8SImode/V16HImode/V32QImode. */
16788 if (valid_perm_using_mode_p (V4DImode, d))
16789 {
16790 for (i = 0; i < 4; i++)
16791 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16792 if (d->testing_p)
16793 return true;
16794 target = gen_reg_rtx (V4DImode);
16795 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16796 perm, 4, false))
16797 {
16798 emit_move_insn (d->target,
16799 gen_lowpart (d->vmode, target));
16800 return true;
16801 }
16802 return false;
16803 }
16804
16805 /* Next see if vpermd can be used. */
16806 if (valid_perm_using_mode_p (V8SImode, d))
16807 vmode = V8SImode;
16808 }
16809 /* Or if vpermps can be used. */
16810 else if (d->vmode == V8SFmode)
16811 vmode = V8SImode;
16812
16813 if (vmode == V32QImode)
16814 {
16815 /* vpshufb only works intra lane; it is not
16816    possible to shuffle bytes between lanes. */
16817 for (i = 0; i < nelt; ++i)
16818 if ((d->perm[i] ^ i) & (nelt / 2))
16819 return false;
16820 }
16821 }
16822 else if (GET_MODE_SIZE (d->vmode) == 64)
16823 {
16824 if (!TARGET_AVX512BW)
16825 return false;
16826
16827 /* If vpermq didn't work, vpshufb won't work either. */
16828 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16829 return false;
16830
16831 vmode = V64QImode;
16832 if (d->vmode == V16SImode
16833 || d->vmode == V32HImode
16834 || d->vmode == V64QImode)
16835 {
16836 /* First see if vpermq can be used for
16837 V16SImode/V32HImode/V64QImode. */
16838 if (valid_perm_using_mode_p (V8DImode, d))
16839 {
16840 for (i = 0; i < 8; i++)
16841 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16842 if (d->testing_p)
16843 return true;
16844 target = gen_reg_rtx (V8DImode);
16845 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16846 perm, 8, false))
16847 {
16848 emit_move_insn (d->target,
16849 gen_lowpart (d->vmode, target));
16850 return true;
16851 }
16852 return false;
16853 }
16854
16855 /* Next see if vpermd can be used. */
16856 if (valid_perm_using_mode_p (V16SImode, d))
16857 vmode = V16SImode;
16858 }
16859 /* Or if vpermps can be used. */
16860 else if (d->vmode == V16SFmode)
16861 vmode = V16SImode;
16862 if (vmode == V64QImode)
16863 {
16864 /* vpshufb only works intra lane; it is not
16865    possible to shuffle bytes between lanes. */
16866 for (i = 0; i < nelt; ++i)
16867 if ((d->perm[i] ^ i) & (nelt / 4))
16868 return false;
16869 }
16870 }
16871 else
16872 return false;
16873 }
16874
16875 if (d->testing_p)
16876 return true;
16877
16878 if (vmode == V8SImode)
16879 for (i = 0; i < 8; ++i)
16880 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16881 else if (vmode == V16SImode)
16882 for (i = 0; i < 16; ++i)
16883 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16884 else
16885 {
16886 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16887 if (!d->one_operand_p)
16888 mask = 2 * nelt - 1;
16889 else if (vmode == V16QImode)
16890 mask = nelt - 1;
16891 else if (vmode == V64QImode)
16892 mask = nelt / 4 - 1;
16893 else
16894 mask = nelt / 2 - 1;
16895
16896 for (i = 0; i < nelt; ++i)
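      /* Build the byte-level control vector: expand each element index into
	 the indices of its eltsz constituent bytes.  */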
16897 {
16898 unsigned j, e = d->perm[i] & mask;
16899 for (j = 0; j < eltsz; ++j)
16900 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16901 }
16902 }
16903
16904 vperm = gen_rtx_CONST_VECTOR (vmode,
16905 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16906 vperm = force_reg (vmode, vperm);
16907
16908 target = d->target;
16909 if (d->vmode != vmode)
16910 target = gen_reg_rtx (vmode);
16911 op0 = gen_lowpart (vmode, d->op0);
16912 if (d->one_operand_p)
16913 {
16914 if (vmode == V16QImode)
16915 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16916 else if (vmode == V32QImode)
16917 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16918 else if (vmode == V64QImode)
16919 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16920 else if (vmode == V8SFmode)
16921 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16922 else if (vmode == V8SImode)
16923 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16924 else if (vmode == V16SFmode)
16925 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16926 else if (vmode == V16SImode)
16927 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16928 else
16929 gcc_unreachable ();
16930 }
16931 else
16932 {
16933 op1 = gen_lowpart (vmode, d->op1);
16934 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16935 }
16936 if (target != d->target)
16937 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16938
16939 return true;
16940 }
16941
16942 /* For V*[QHS]Imode permutations, check whether the same permutation
16943    can be performed in a 2x, 4x or 8x wider inner mode. */
16944
16945 static bool
16946 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16947 struct expand_vec_perm_d *nd)
16948 {
16949 int i;
16950 machine_mode mode = VOIDmode;
16951
16952 switch (d->vmode)
16953 {
16954 case E_V16QImode: mode = V8HImode; break;
16955 case E_V32QImode: mode = V16HImode; break;
16956 case E_V64QImode: mode = V32HImode; break;
16957 case E_V8HImode: mode = V4SImode; break;
16958 case E_V16HImode: mode = V8SImode; break;
16959 case E_V32HImode: mode = V16SImode; break;
16960 case E_V4SImode: mode = V2DImode; break;
16961 case E_V8SImode: mode = V4DImode; break;
16962 case E_V16SImode: mode = V8DImode; break;
16963 default: return false;
16964 }
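  /* The wider mode is usable only if every even/odd index pair selects an
     even-aligned pair of adjacent elements.  */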
16965 for (i = 0; i < d->nelt; i += 2)
16966 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16967 return false;
16968 nd->vmode = mode;
16969 nd->nelt = d->nelt / 2;
16970 for (i = 0; i < nd->nelt; i++)
16971 nd->perm[i] = d->perm[2 * i] / 2;
16972 if (GET_MODE_INNER (mode) != DImode)
16973 canonicalize_vector_int_perm (nd, nd);
16974 if (nd != d)
16975 {
16976 nd->one_operand_p = d->one_operand_p;
16977 nd->testing_p = d->testing_p;
16978 if (d->op0 == d->op1)
16979 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16980 else
16981 {
16982 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16983 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16984 }
16985 if (d->testing_p)
16986 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16987 else
16988 nd->target = gen_reg_rtx (nd->vmode);
16989 }
16990 return true;
16991 }
16992
16993 /* Try to expand one-operand permutation with constant mask. */
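/* Illustrative example: for a V8DFmode reversal { 7, 6, 5, 4, 3, 2, 1, 0 }
   the constant index vector is forced into a V8DImode register and a single
   variable vpermpd (gen_avx512f_permvarv8df) performs the whole permutation.  */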
16994
16995 static bool
16996 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
16997 {
16998 machine_mode mode = GET_MODE (d->op0);
16999 machine_mode maskmode = mode;
17000 rtx (*gen) (rtx, rtx, rtx) = NULL;
17001 rtx target, op0, mask;
17002 rtx vec[64];
17003
17004 if (!rtx_equal_p (d->op0, d->op1))
17005 return false;
17006
17007 if (!TARGET_AVX512F)
17008 return false;
17009
17010 switch (mode)
17011 {
17012 case E_V16SImode:
17013 gen = gen_avx512f_permvarv16si;
17014 break;
17015 case E_V16SFmode:
17016 gen = gen_avx512f_permvarv16sf;
17017 maskmode = V16SImode;
17018 break;
17019 case E_V8DImode:
17020 gen = gen_avx512f_permvarv8di;
17021 break;
17022 case E_V8DFmode:
17023 gen = gen_avx512f_permvarv8df;
17024 maskmode = V8DImode;
17025 break;
17026 default:
17027 return false;
17028 }
17029
17030 target = d->target;
17031 op0 = d->op0;
17032 for (int i = 0; i < d->nelt; ++i)
17033 vec[i] = GEN_INT (d->perm[i]);
17034 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17035 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17036 return true;
17037 }
17038
17039 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17040
17041 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17042 in a single instruction. */
17043
17044 static bool
17045 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17046 {
17047 unsigned i, nelt = d->nelt;
17048 struct expand_vec_perm_d nd;
17049
17050 /* Check plain VEC_SELECT first, because AVX has instructions that could
17051 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17052 input where SEL+CONCAT may not. */
17053 if (d->one_operand_p)
17054 {
17055 int mask = nelt - 1;
17056 bool identity_perm = true;
17057 bool broadcast_perm = true;
17058
17059 for (i = 0; i < nelt; i++)
17060 {
17061 nd.perm[i] = d->perm[i] & mask;
17062 if (nd.perm[i] != i)
17063 identity_perm = false;
17064 if (nd.perm[i])
17065 broadcast_perm = false;
17066 }
17067
17068 if (identity_perm)
17069 {
17070 if (!d->testing_p)
17071 emit_move_insn (d->target, d->op0);
17072 return true;
17073 }
17074 else if (broadcast_perm && TARGET_AVX2)
17075 {
17076 /* Use vpbroadcast{b,w,d}. */
17077 rtx (*gen) (rtx, rtx) = NULL;
17078 switch (d->vmode)
17079 {
17080 case E_V64QImode:
17081 if (TARGET_AVX512BW)
17082 gen = gen_avx512bw_vec_dupv64qi_1;
17083 break;
17084 case E_V32QImode:
17085 gen = gen_avx2_pbroadcastv32qi_1;
17086 break;
17087 case E_V32HImode:
17088 if (TARGET_AVX512BW)
17089 gen = gen_avx512bw_vec_dupv32hi_1;
17090 break;
17091 case E_V16HImode:
17092 gen = gen_avx2_pbroadcastv16hi_1;
17093 break;
17094 case E_V16SImode:
17095 if (TARGET_AVX512F)
17096 gen = gen_avx512f_vec_dupv16si_1;
17097 break;
17098 case E_V8SImode:
17099 gen = gen_avx2_pbroadcastv8si_1;
17100 break;
17101 case E_V16QImode:
17102 gen = gen_avx2_pbroadcastv16qi;
17103 break;
17104 case E_V8HImode:
17105 gen = gen_avx2_pbroadcastv8hi;
17106 break;
17107 case E_V16SFmode:
17108 if (TARGET_AVX512F)
17109 gen = gen_avx512f_vec_dupv16sf_1;
17110 break;
17111 case E_V8SFmode:
17112 gen = gen_avx2_vec_dupv8sf_1;
17113 break;
17114 case E_V8DFmode:
17115 if (TARGET_AVX512F)
17116 gen = gen_avx512f_vec_dupv8df_1;
17117 break;
17118 case E_V8DImode:
17119 if (TARGET_AVX512F)
17120 gen = gen_avx512f_vec_dupv8di_1;
17121 break;
17122 /* For other modes prefer other shuffles this function creates. */
17123 default: break;
17124 }
17125 if (gen != NULL)
17126 {
17127 if (!d->testing_p)
17128 emit_insn (gen (d->target, d->op0));
17129 return true;
17130 }
17131 }
17132
17133 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17134 return true;
17135
17136 /* There are plenty of patterns in sse.md that are written for
17137 SEL+CONCAT and are not replicated for a single op. Perhaps
17138 that should be changed, to avoid the nastiness here. */
17139
17140 /* Recognize interleave style patterns, which means incrementing
17141 every other permutation operand. */
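/* Illustrative example: a one-operand V4SImode permutation { 0, 0, 1, 1 }
   becomes nd.perm = { 0, 4, 1, 5 } on (op0, op0), which matches the
   punpckldq-style SEL+CONCAT pattern.  */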
17142 for (i = 0; i < nelt; i += 2)
17143 {
17144 nd.perm[i] = d->perm[i] & mask;
17145 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17146 }
17147 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17148 d->testing_p))
17149 return true;
17150
17151 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17152 if (nelt >= 4)
17153 {
17154 for (i = 0; i < nelt; i += 4)
17155 {
17156 nd.perm[i + 0] = d->perm[i + 0] & mask;
17157 nd.perm[i + 1] = d->perm[i + 1] & mask;
17158 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17159 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17160 }
17161
17162 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17163 d->testing_p))
17164 return true;
17165 }
17166 }
17167
17168 /* Try movss/movsd instructions. */
17169 if (expand_vec_perm_movs (d))
17170 return true;
17171
17172 /* Finally, try the fully general two operand permute. */
17173 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17174 d->testing_p))
17175 return true;
17176
17177 /* Recognize interleave style patterns with reversed operands. */
17178 if (!d->one_operand_p)
17179 {
17180 for (i = 0; i < nelt; ++i)
17181 {
17182 unsigned e = d->perm[i];
17183 if (e >= nelt)
17184 e -= nelt;
17185 else
17186 e += nelt;
17187 nd.perm[i] = e;
17188 }
17189
17190 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17191 d->testing_p))
17192 return true;
17193 }
17194
17195 /* Try the SSE4.1 blend variable merge instructions. */
17196 if (expand_vec_perm_blend (d))
17197 return true;
17198
17199 /* Try one of the AVX vpermil variable permutations. */
17200 if (expand_vec_perm_vpermil (d))
17201 return true;
17202
17203 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17204 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17205 if (expand_vec_perm_pshufb (d))
17206 return true;
17207
17208 /* Try the AVX2 vpalignr instruction. */
17209 if (expand_vec_perm_palignr (d, true))
17210 return true;
17211
17212 /* Try the AVX512F vperm{s,d} instructions. */
17213 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17214 return true;
17215
17216 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17217 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17218 return true;
17219
17220 /* See if we can get the same permutation in different vector integer
17221 mode. */
17222 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17223 {
17224 if (!d->testing_p)
17225 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17226 return true;
17227 }
17228 return false;
17229 }
17230
17231 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17232 in terms of a pair of pshuflw + pshufhw instructions. */
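/* Illustrative example: the one-operand V8HImode permutation
   { 3, 1, 2, 0, 5, 7, 6, 4 } keeps the low four and the high four elements
   within their own 64-bit halves, so it expands to pshuflw selecting
   { 3, 1, 2, 0 } followed by pshufhw selecting { 5, 7, 6, 4 }.  */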
17233
17234 static bool
17235 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17236 {
17237 unsigned char perm2[MAX_VECT_LEN];
17238 unsigned i;
17239 bool ok;
17240
17241 if (d->vmode != V8HImode || !d->one_operand_p)
17242 return false;
17243
17244 /* The two permutations only operate in 64-bit lanes. */
17245 for (i = 0; i < 4; ++i)
17246 if (d->perm[i] >= 4)
17247 return false;
17248 for (i = 4; i < 8; ++i)
17249 if (d->perm[i] < 4)
17250 return false;
17251
17252 if (d->testing_p)
17253 return true;
17254
17255 /* Emit the pshuflw. */
17256 memcpy (perm2, d->perm, 4);
17257 for (i = 4; i < 8; ++i)
17258 perm2[i] = i;
17259 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17260 gcc_assert (ok);
17261
17262 /* Emit the pshufhw. */
17263 memcpy (perm2 + 4, d->perm + 4, 4);
17264 for (i = 0; i < 4; ++i)
17265 perm2[i] = i;
17266 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17267 gcc_assert (ok);
17268
17269 return true;
17270 }
17271
17272 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17273 the permutation using the SSSE3 palignr instruction. This succeeds
17274 when all of the elements in PERM fit within one vector and we merely
17275 need to shift them down so that a single vector permutation has a
17276 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17277 the vpalignr instruction itself can perform the requested permutation. */
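/* Illustrative example: the two-operand V16QImode permutation
   { 2, 3, ..., 17 } has min == 2 and max == 17, so a single palignr that
   shifts the op1:op0 pair right by two bytes already yields the requested
   order and no follow-up pshufb is needed.  */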
17278
17279 static bool
17280 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17281 {
17282 unsigned i, nelt = d->nelt;
17283 unsigned min, max, minswap, maxswap;
17284 bool in_order, ok, swap = false;
17285 rtx shift, target;
17286 struct expand_vec_perm_d dcopy;
17287
17288 /* Even with AVX, palignr only operates on 128-bit vectors;
17289 with AVX2 palignr operates on both 128-bit lanes. */
17290 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17291 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17292 return false;
17293
17294 min = 2 * nelt;
17295 max = 0;
17296 minswap = 2 * nelt;
17297 maxswap = 0;
17298 for (i = 0; i < nelt; ++i)
17299 {
17300 unsigned e = d->perm[i];
17301 unsigned eswap = d->perm[i] ^ nelt;
17302 if (GET_MODE_SIZE (d->vmode) == 32)
17303 {
17304 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17305 eswap = e ^ (nelt / 2);
17306 }
17307 if (e < min)
17308 min = e;
17309 if (e > max)
17310 max = e;
17311 if (eswap < minswap)
17312 minswap = eswap;
17313 if (eswap > maxswap)
17314 maxswap = eswap;
17315 }
17316 if (min == 0
17317 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17318 {
17319 if (d->one_operand_p
17320 || minswap == 0
17321 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17322 ? nelt / 2 : nelt))
17323 return false;
17324 swap = true;
17325 min = minswap;
17326 max = maxswap;
17327 }
17328
17329 /* Given that we have SSSE3, we know we'll be able to implement the
17330 single operand permutation after the palignr with pshufb for
17331 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17332 first. */
17333 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17334 return true;
17335
17336 dcopy = *d;
17337 if (swap)
17338 {
17339 dcopy.op0 = d->op1;
17340 dcopy.op1 = d->op0;
17341 for (i = 0; i < nelt; ++i)
17342 dcopy.perm[i] ^= nelt;
17343 }
17344
17345 in_order = true;
17346 for (i = 0; i < nelt; ++i)
17347 {
17348 unsigned e = dcopy.perm[i];
17349 if (GET_MODE_SIZE (d->vmode) == 32
17350 && e >= nelt
17351 && (e & (nelt / 2 - 1)) < min)
17352 e = e - min - (nelt / 2);
17353 else
17354 e = e - min;
17355 if (e != i)
17356 in_order = false;
17357 dcopy.perm[i] = e;
17358 }
17359 dcopy.one_operand_p = true;
17360
17361 if (single_insn_only_p && !in_order)
17362 return false;
17363
17364 /* For AVX2, test whether we can permute the result in one instruction. */
17365 if (d->testing_p)
17366 {
17367 if (in_order)
17368 return true;
17369 dcopy.op1 = dcopy.op0;
17370 return expand_vec_perm_1 (&dcopy);
17371 }
17372
17373 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17374 if (GET_MODE_SIZE (d->vmode) == 16)
17375 {
17376 target = gen_reg_rtx (TImode);
17377 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17378 gen_lowpart (TImode, dcopy.op0), shift));
17379 }
17380 else
17381 {
17382 target = gen_reg_rtx (V2TImode);
17383 emit_insn (gen_avx2_palignrv2ti (target,
17384 gen_lowpart (V2TImode, dcopy.op1),
17385 gen_lowpart (V2TImode, dcopy.op0),
17386 shift));
17387 }
17388
17389 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17390
17391 /* Test for the degenerate case where the alignment by itself
17392 produces the desired permutation. */
17393 if (in_order)
17394 {
17395 emit_move_insn (d->target, dcopy.op0);
17396 return true;
17397 }
17398
17399 ok = expand_vec_perm_1 (&dcopy);
17400 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17401
17402 return ok;
17403 }
17404
17405 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17406 the permutation using the SSE4_1 pblendv instruction. Potentially
17407 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
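/* Illustrative example: for an nelt == 8 permutation such as
   { 0, 1, 8, 3, 4, 5, 9, 7 } (see the example below), op1 is first permuted
   on its own so that its elements 0 and 1 land in positions 2 and 6, and a
   single blend then merges those two positions into the otherwise
   untouched op0.  */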
17408
17409 static bool
17410 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17411 {
17412 unsigned i, which, nelt = d->nelt;
17413 struct expand_vec_perm_d dcopy, dcopy1;
17414 machine_mode vmode = d->vmode;
17415 bool ok;
17416
17417 /* Use the same checks as in expand_vec_perm_blend. */
17418 if (d->one_operand_p)
17419 return false;
17420 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17421 ;
17422 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17423 ;
17424 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17425 ;
17426 else
17427 return false;
17428
17429 /* Figure out which permutation elements do not stay in their
17430 respective lanes. */
17431 for (i = 0, which = 0; i < nelt; ++i)
17432 {
17433 unsigned e = d->perm[i];
17434 if (e != i)
17435 which |= (e < nelt ? 1 : 2);
17436 }
17437 /* We can pblend the part where elements do not stay in their
17438 respective lanes only when these elements all come from one
17439 half of the permutation.
17440 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not at their respective
17441 lanes, but both 8 and 9 are >= 8.
17442 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not at their
17443 respective lanes, and 8 is >= 8 but 2 is not. */
17444 if (which != 1 && which != 2)
17445 return false;
17446 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17447 return true;
17448
17449 /* First we apply one operand permutation to the part where
17450 elements stay not in their respective lanes. */
17451 dcopy = *d;
17452 if (which == 2)
17453 dcopy.op0 = dcopy.op1 = d->op1;
17454 else
17455 dcopy.op0 = dcopy.op1 = d->op0;
17456 if (!d->testing_p)
17457 dcopy.target = gen_reg_rtx (vmode);
17458 dcopy.one_operand_p = true;
17459
17460 for (i = 0; i < nelt; ++i)
17461 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17462
17463 ok = expand_vec_perm_1 (&dcopy);
17464 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17465 return false;
17466 else
17467 gcc_assert (ok);
17468 if (d->testing_p)
17469 return true;
17470
17471 /* Next we put permuted elements into their positions. */
17472 dcopy1 = *d;
17473 if (which == 2)
17474 dcopy1.op1 = dcopy.target;
17475 else
17476 dcopy1.op0 = dcopy.target;
17477
17478 for (i = 0; i < nelt; ++i)
17479 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17480
17481 ok = expand_vec_perm_blend (&dcopy1);
17482 gcc_assert (ok);
17483
17484 return true;
17485 }
17486
17487 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17488
17489 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17490 a two vector permutation into a single vector permutation by using
17491 an interleave operation to merge the vectors. */
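/* Illustrative example: the two-operand V4SImode permutation { 1, 5, 0, 4 }
   draws only on the low halves of both inputs, so punpckldq first forms
   { 0, 4, 1, 5 } and the final single-insn shuffle is a pshufd of that
   intermediate with { 2, 3, 0, 1 }.  */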
17492
17493 static bool
17494 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17495 {
17496 struct expand_vec_perm_d dremap, dfinal;
17497 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17498 unsigned HOST_WIDE_INT contents;
17499 unsigned char remap[2 * MAX_VECT_LEN];
17500 rtx_insn *seq;
17501 bool ok, same_halves = false;
17502
17503 if (GET_MODE_SIZE (d->vmode) == 16)
17504 {
17505 if (d->one_operand_p)
17506 return false;
17507 }
17508 else if (GET_MODE_SIZE (d->vmode) == 32)
17509 {
17510 if (!TARGET_AVX)
17511 return false;
17512 /* For 32-byte modes allow even d->one_operand_p.
17513 The lack of cross-lane shuffling in some instructions
17514 might prevent a single insn shuffle. */
17515 dfinal = *d;
17516 dfinal.testing_p = true;
17517 /* If expand_vec_perm_interleave3 can expand this into
17518 a 3 insn sequence, give up and let it be expanded as
17519 3 insn sequence. While that is one insn longer,
17520 it doesn't need a memory operand and in the common
17521 case that both interleave low and high permutations
17522 with the same operands are adjacent needs 4 insns
17523 for both after CSE. */
17524 if (expand_vec_perm_interleave3 (&dfinal))
17525 return false;
17526 }
17527 else
17528 return false;
17529
17530 /* Examine from whence the elements come. */
17531 contents = 0;
17532 for (i = 0; i < nelt; ++i)
17533 contents |= HOST_WIDE_INT_1U << d->perm[i];
17534
17535 memset (remap, 0xff, sizeof (remap));
17536 dremap = *d;
17537
17538 if (GET_MODE_SIZE (d->vmode) == 16)
17539 {
17540 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17541
17542 /* Split the two input vectors into 4 halves. */
17543 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17544 h2 = h1 << nelt2;
17545 h3 = h2 << nelt2;
17546 h4 = h3 << nelt2;
17547
17548 /* If the elements are from the low halves, use interleave low; similarly
17549 for interleave high. If the elements are from mis-matched halves, we
17550 can use shufps for V4SF/V4SI or do a DImode shuffle. */
17551 if ((contents & (h1 | h3)) == contents)
17552 {
17553 /* punpckl* */
17554 for (i = 0; i < nelt2; ++i)
17555 {
17556 remap[i] = i * 2;
17557 remap[i + nelt] = i * 2 + 1;
17558 dremap.perm[i * 2] = i;
17559 dremap.perm[i * 2 + 1] = i + nelt;
17560 }
17561 if (!TARGET_SSE2 && d->vmode == V4SImode)
17562 dremap.vmode = V4SFmode;
17563 }
17564 else if ((contents & (h2 | h4)) == contents)
17565 {
17566 /* punpckh* */
17567 for (i = 0; i < nelt2; ++i)
17568 {
17569 remap[i + nelt2] = i * 2;
17570 remap[i + nelt + nelt2] = i * 2 + 1;
17571 dremap.perm[i * 2] = i + nelt2;
17572 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17573 }
17574 if (!TARGET_SSE2 && d->vmode == V4SImode)
17575 dremap.vmode = V4SFmode;
17576 }
17577 else if ((contents & (h1 | h4)) == contents)
17578 {
17579 /* shufps */
17580 for (i = 0; i < nelt2; ++i)
17581 {
17582 remap[i] = i;
17583 remap[i + nelt + nelt2] = i + nelt2;
17584 dremap.perm[i] = i;
17585 dremap.perm[i + nelt2] = i + nelt + nelt2;
17586 }
17587 if (nelt != 4)
17588 {
17589 /* shufpd */
17590 dremap.vmode = V2DImode;
17591 dremap.nelt = 2;
17592 dremap.perm[0] = 0;
17593 dremap.perm[1] = 3;
17594 }
17595 }
17596 else if ((contents & (h2 | h3)) == contents)
17597 {
17598 /* shufps */
17599 for (i = 0; i < nelt2; ++i)
17600 {
17601 remap[i + nelt2] = i;
17602 remap[i + nelt] = i + nelt2;
17603 dremap.perm[i] = i + nelt2;
17604 dremap.perm[i + nelt2] = i + nelt;
17605 }
17606 if (nelt != 4)
17607 {
17608 /* shufpd */
17609 dremap.vmode = V2DImode;
17610 dremap.nelt = 2;
17611 dremap.perm[0] = 1;
17612 dremap.perm[1] = 2;
17613 }
17614 }
17615 else
17616 return false;
17617 }
17618 else
17619 {
17620 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17621 unsigned HOST_WIDE_INT q[8];
17622 unsigned int nonzero_halves[4];
17623
17624 /* Split the two input vectors into 8 quarters. */
17625 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17626 for (i = 1; i < 8; ++i)
17627 q[i] = q[0] << (nelt4 * i);
17628 for (i = 0; i < 4; ++i)
17629 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17630 {
17631 nonzero_halves[nzcnt] = i;
17632 ++nzcnt;
17633 }
17634
17635 if (nzcnt == 1)
17636 {
17637 gcc_assert (d->one_operand_p);
17638 nonzero_halves[1] = nonzero_halves[0];
17639 same_halves = true;
17640 }
17641 else if (d->one_operand_p)
17642 {
17643 gcc_assert (nonzero_halves[0] == 0);
17644 gcc_assert (nonzero_halves[1] == 1);
17645 }
17646
17647 if (nzcnt <= 2)
17648 {
17649 if (d->perm[0] / nelt2 == nonzero_halves[1])
17650 {
17651 /* Attempt to increase the likelihood that dfinal
17652 shuffle will be intra-lane. */
17653 std::swap (nonzero_halves[0], nonzero_halves[1]);
17654 }
17655
17656 /* vperm2f128 or vperm2i128. */
17657 for (i = 0; i < nelt2; ++i)
17658 {
17659 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17660 remap[i + nonzero_halves[0] * nelt2] = i;
17661 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17662 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17663 }
17664
17665 if (d->vmode != V8SFmode
17666 && d->vmode != V4DFmode
17667 && d->vmode != V8SImode)
17668 {
17669 dremap.vmode = V8SImode;
17670 dremap.nelt = 8;
17671 for (i = 0; i < 4; ++i)
17672 {
17673 dremap.perm[i] = i + nonzero_halves[0] * 4;
17674 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17675 }
17676 }
17677 }
17678 else if (d->one_operand_p)
17679 return false;
17680 else if (TARGET_AVX2
17681 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17682 {
17683 /* vpunpckl* */
17684 for (i = 0; i < nelt4; ++i)
17685 {
17686 remap[i] = i * 2;
17687 remap[i + nelt] = i * 2 + 1;
17688 remap[i + nelt2] = i * 2 + nelt2;
17689 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17690 dremap.perm[i * 2] = i;
17691 dremap.perm[i * 2 + 1] = i + nelt;
17692 dremap.perm[i * 2 + nelt2] = i + nelt2;
17693 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17694 }
17695 }
17696 else if (TARGET_AVX2
17697 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17698 {
17699 /* vpunpckh* */
17700 for (i = 0; i < nelt4; ++i)
17701 {
17702 remap[i + nelt4] = i * 2;
17703 remap[i + nelt + nelt4] = i * 2 + 1;
17704 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17705 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17706 dremap.perm[i * 2] = i + nelt4;
17707 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17708 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17709 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17710 }
17711 }
17712 else
17713 return false;
17714 }
17715
17716 /* Use the remapping array set up above to move the elements from their
17717 swizzled locations into their final destinations. */
17718 dfinal = *d;
17719 for (i = 0; i < nelt; ++i)
17720 {
17721 unsigned e = remap[d->perm[i]];
17722 gcc_assert (e < nelt);
17723 /* If same_halves is true, both halves of the remapped vector are the
17724 same. Avoid cross-lane accesses if possible. */
17725 if (same_halves && i >= nelt2)
17726 {
17727 gcc_assert (e < nelt2);
17728 dfinal.perm[i] = e + nelt2;
17729 }
17730 else
17731 dfinal.perm[i] = e;
17732 }
17733 if (!d->testing_p)
17734 {
17735 dremap.target = gen_reg_rtx (dremap.vmode);
17736 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17737 }
17738 dfinal.op1 = dfinal.op0;
17739 dfinal.one_operand_p = true;
17740
17741 /* Test if the final remap can be done with a single insn. For V4SFmode or
17742 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17743 start_sequence ();
17744 ok = expand_vec_perm_1 (&dfinal);
17745 seq = get_insns ();
17746 end_sequence ();
17747
17748 if (!ok)
17749 return false;
17750
17751 if (d->testing_p)
17752 return true;
17753
17754 if (dremap.vmode != dfinal.vmode)
17755 {
17756 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17757 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17758 }
17759
17760 ok = expand_vec_perm_1 (&dremap);
17761 gcc_assert (ok);
17762
17763 emit_insn (seq);
17764 return true;
17765 }
17766
17767 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17768 a single vector cross-lane permutation into vpermq followed
17769 by any of the single insn permutations. */
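/* Illustrative sketch: if the low half of a one-operand V32QImode result
   needs bytes only from 64-bit quarters 2 and 3 of the input while the high
   half needs only quarters 0 and 1, a vpermq selecting { 2, 3, 0, 1 } first
   moves those quarters into the right 128-bit lane and a single in-lane
   shuffle such as vpshufb then finishes the job.  */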
17770
17771 static bool
17772 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17773 {
17774 struct expand_vec_perm_d dremap, dfinal;
17775 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17776 unsigned contents[2];
17777 bool ok;
17778
17779 if (!(TARGET_AVX2
17780 && (d->vmode == V32QImode || d->vmode == V16HImode)
17781 && d->one_operand_p))
17782 return false;
17783
17784 contents[0] = 0;
17785 contents[1] = 0;
17786 for (i = 0; i < nelt2; ++i)
17787 {
17788 contents[0] |= 1u << (d->perm[i] / nelt4);
17789 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17790 }
17791
17792 for (i = 0; i < 2; ++i)
17793 {
17794 unsigned int cnt = 0;
17795 for (j = 0; j < 4; ++j)
17796 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17797 return false;
17798 }
17799
17800 if (d->testing_p)
17801 return true;
17802
17803 dremap = *d;
17804 dremap.vmode = V4DImode;
17805 dremap.nelt = 4;
17806 dremap.target = gen_reg_rtx (V4DImode);
17807 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17808 dremap.op1 = dremap.op0;
17809 dremap.one_operand_p = true;
17810 for (i = 0; i < 2; ++i)
17811 {
17812 unsigned int cnt = 0;
17813 for (j = 0; j < 4; ++j)
17814 if ((contents[i] & (1u << j)) != 0)
17815 dremap.perm[2 * i + cnt++] = j;
17816 for (; cnt < 2; ++cnt)
17817 dremap.perm[2 * i + cnt] = 0;
17818 }
17819
17820 dfinal = *d;
17821 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17822 dfinal.op1 = dfinal.op0;
17823 dfinal.one_operand_p = true;
17824 for (i = 0, j = 0; i < nelt; ++i)
17825 {
17826 if (i == nelt2)
17827 j = 2;
17828 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17829 if ((d->perm[i] / nelt4) == dremap.perm[j])
17830 ;
17831 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17832 dfinal.perm[i] |= nelt4;
17833 else
17834 gcc_unreachable ();
17835 }
17836
17837 ok = expand_vec_perm_1 (&dremap);
17838 gcc_assert (ok);
17839
17840 ok = expand_vec_perm_1 (&dfinal);
17841 gcc_assert (ok);
17842
17843 return true;
17844 }
17845
17846 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17847
17848 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
17849 a vector permutation using two instructions, vperm2f128 resp.
17850 vperm2i128 followed by any single in-lane permutation. */
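/* Illustrative walk-through: for the two-operand V4DFmode permutation
   { 3, 2, 5, 4 } the loop considers the vperm2f128 immediate 0x21 (high
   lane of op0 next to the low lane of op1) and then asks expand_vec_perm_1
   whether the remaining in-lane swap { 5, 4, 7, 6 } of that intermediate
   is a single insn.  */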
17851
17852 static bool
17853 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17854 {
17855 struct expand_vec_perm_d dfirst, dsecond;
17856 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17857 bool ok;
17858
17859 if (!TARGET_AVX
17860 || GET_MODE_SIZE (d->vmode) != 32
17861 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17862 return false;
17863
17864 dsecond = *d;
17865 dsecond.one_operand_p = false;
17866 dsecond.testing_p = true;
17867
17868 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17869 immediate. For perm < 16 the second permutation uses
17870 d->op0 as first operand, for perm >= 16 it uses d->op1
17871 as first operand. The second operand is the result of
17872 vperm2[fi]128. */
17873 for (perm = 0; perm < 32; perm++)
17874 {
17875 /* Ignore permutations which do not move anything cross-lane. */
17876 if (perm < 16)
17877 {
17878 /* The second shuffle for e.g. V4DFmode has
17879 0123 and ABCD operands.
17880 Ignore AB23, as 23 is already in the second lane
17881 of the first operand. */
17882 if ((perm & 0xc) == (1 << 2)) continue;
17883 /* And 01CD, as 01 is in the first lane of the first
17884 operand. */
17885 if ((perm & 3) == 0) continue;
17886 /* And 4567, as then the vperm2[fi]128 doesn't change
17887 anything on the original 4567 second operand. */
17888 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17889 }
17890 else
17891 {
17892 /* The second shuffle for e.g. V4DFmode has
17893 4567 and ABCD operands.
17894 Ignore AB67, as 67 is already in the second lane
17895 of the first operand. */
17896 if ((perm & 0xc) == (3 << 2)) continue;
17897 /* And 45CD, as 45 is in the first lane of the first
17898 operand. */
17899 if ((perm & 3) == 2) continue;
17900 /* And 0123, as then the vperm2[fi]128 doesn't change
17901 anything on the original 0123 first operand. */
17902 if ((perm & 0xf) == (1 << 2)) continue;
17903 }
17904
17905 for (i = 0; i < nelt; i++)
17906 {
17907 j = d->perm[i] / nelt2;
17908 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17909 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17910 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17911 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17912 else
17913 break;
17914 }
17915
17916 if (i == nelt)
17917 {
17918 start_sequence ();
17919 ok = expand_vec_perm_1 (&dsecond);
17920 end_sequence ();
17921 }
17922 else
17923 ok = false;
17924
17925 if (ok)
17926 {
17927 if (d->testing_p)
17928 return true;
17929
17930 /* Found a usable second shuffle. dfirst will be
17931 vperm2f128 on d->op0 and d->op1. */
17932 dsecond.testing_p = false;
17933 dfirst = *d;
17934 dfirst.target = gen_reg_rtx (d->vmode);
17935 for (i = 0; i < nelt; i++)
17936 dfirst.perm[i] = (i & (nelt2 - 1))
17937 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17938
17939 canonicalize_perm (&dfirst);
17940 ok = expand_vec_perm_1 (&dfirst);
17941 gcc_assert (ok);
17942
17943 /* And dsecond is some single insn shuffle, taking
17944 d->op0 and result of vperm2f128 (if perm < 16) or
17945 d->op1 and result of vperm2f128 (otherwise). */
17946 if (perm >= 16)
17947 dsecond.op0 = dsecond.op1;
17948 dsecond.op1 = dfirst.target;
17949
17950 ok = expand_vec_perm_1 (&dsecond);
17951 gcc_assert (ok);
17952
17953 return true;
17954 }
17955
17956 /* For one operand, the only useful vperm2f128 permutation is 0x01
17957 aka lanes swap. */
17958 if (d->one_operand_p)
17959 return false;
17960 }
17961
17962 return false;
17963 }
17964
17965 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17966 a two vector permutation using 2 intra-lane interleave insns
17967 and cross-lane shuffle for 32-byte vectors. */
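/* Illustrative example: the two-operand V8SImode permutation
   { 0, 8, 1, 9, 2, 10, 3, 11 } matches the d->perm[0] == 0 case and is
   emitted through gen_vec_interleave_lowv8si, while the mirrored
   { 4, 12, 5, 13, 6, 14, 7, 15 } uses gen_vec_interleave_highv8si.  */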
17968
17969 static bool
17970 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17971 {
17972 unsigned i, nelt;
17973 rtx (*gen) (rtx, rtx, rtx);
17974
17975 if (d->one_operand_p)
17976 return false;
17977 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17978 ;
17979 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17980 ;
17981 else
17982 return false;
17983
17984 nelt = d->nelt;
17985 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17986 return false;
17987 for (i = 0; i < nelt; i += 2)
17988 if (d->perm[i] != d->perm[0] + i / 2
17989 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17990 return false;
17991
17992 if (d->testing_p)
17993 return true;
17994
17995 switch (d->vmode)
17996 {
17997 case E_V32QImode:
17998 if (d->perm[0])
17999 gen = gen_vec_interleave_highv32qi;
18000 else
18001 gen = gen_vec_interleave_lowv32qi;
18002 break;
18003 case E_V16HImode:
18004 if (d->perm[0])
18005 gen = gen_vec_interleave_highv16hi;
18006 else
18007 gen = gen_vec_interleave_lowv16hi;
18008 break;
18009 case E_V8SImode:
18010 if (d->perm[0])
18011 gen = gen_vec_interleave_highv8si;
18012 else
18013 gen = gen_vec_interleave_lowv8si;
18014 break;
18015 case E_V4DImode:
18016 if (d->perm[0])
18017 gen = gen_vec_interleave_highv4di;
18018 else
18019 gen = gen_vec_interleave_lowv4di;
18020 break;
18021 case E_V8SFmode:
18022 if (d->perm[0])
18023 gen = gen_vec_interleave_highv8sf;
18024 else
18025 gen = gen_vec_interleave_lowv8sf;
18026 break;
18027 case E_V4DFmode:
18028 if (d->perm[0])
18029 gen = gen_vec_interleave_highv4df;
18030 else
18031 gen = gen_vec_interleave_lowv4df;
18032 break;
18033 default:
18034 gcc_unreachable ();
18035 }
18036
18037 emit_insn (gen (d->target, d->op0, d->op1));
18038 return true;
18039 }
18040
18041 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18042 a single vector permutation using a single intra-lane vector
18043 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18044 the non-swapped and swapped vectors together. */
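/* Illustrative example: for the one-operand V4DFmode permutation
   { 0, 3, 2, 1 } the first shuffle degenerates to a plain copy, the lane
   swap { 2, 3, 0, 1 } is emitted as the second shuffle, and vblendpd with
   mask 0b1010 takes elements 1 and 3 from the swapped copy.  */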
18045
18046 static bool
18047 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18048 {
18049 struct expand_vec_perm_d dfirst, dsecond;
18050 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18051 rtx_insn *seq;
18052 bool ok;
18053 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18054
18055 if (!TARGET_AVX
18056 || TARGET_AVX2
18057 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18058 || !d->one_operand_p)
18059 return false;
18060
18061 dfirst = *d;
18062 for (i = 0; i < nelt; i++)
18063 dfirst.perm[i] = 0xff;
18064 for (i = 0, msk = 0; i < nelt; i++)
18065 {
18066 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18067 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18068 return false;
18069 dfirst.perm[j] = d->perm[i];
18070 if (j != i)
18071 msk |= (1 << i);
18072 }
18073 for (i = 0; i < nelt; i++)
18074 if (dfirst.perm[i] == 0xff)
18075 dfirst.perm[i] = i;
18076
18077 if (!d->testing_p)
18078 dfirst.target = gen_reg_rtx (dfirst.vmode);
18079
18080 start_sequence ();
18081 ok = expand_vec_perm_1 (&dfirst);
18082 seq = get_insns ();
18083 end_sequence ();
18084
18085 if (!ok)
18086 return false;
18087
18088 if (d->testing_p)
18089 return true;
18090
18091 emit_insn (seq);
18092
18093 dsecond = *d;
18094 dsecond.op0 = dfirst.target;
18095 dsecond.op1 = dfirst.target;
18096 dsecond.one_operand_p = true;
18097 dsecond.target = gen_reg_rtx (dsecond.vmode);
18098 for (i = 0; i < nelt; i++)
18099 dsecond.perm[i] = i ^ nelt2;
18100
18101 ok = expand_vec_perm_1 (&dsecond);
18102 gcc_assert (ok);
18103
18104 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18105 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18106 return true;
18107 }
18108
18109 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18110 permutation using two vperm2f128, followed by a vshufpd insn blending
18111 the two vectors together. */
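/* Illustrative example: for the V4DFmode permutation { 2, 4, 1, 7 } the two
   vperm2f128 insns produce { 2, 3, 0, 1 } (op0 with its lanes swapped) and
   { 4, 5, 6, 7 } (op1 unchanged), and the final vshufpd selecting
   { 0, 4, 3, 7 } from those temporaries yields the requested result.  */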
18112
18113 static bool
18114 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18115 {
18116 struct expand_vec_perm_d dfirst, dsecond, dthird;
18117 bool ok;
18118
18119 if (!TARGET_AVX || (d->vmode != V4DFmode))
18120 return false;
18121
18122 if (d->testing_p)
18123 return true;
18124
18125 dfirst = *d;
18126 dsecond = *d;
18127 dthird = *d;
18128
18129 dfirst.perm[0] = (d->perm[0] & ~1);
18130 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18131 dfirst.perm[2] = (d->perm[2] & ~1);
18132 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18133 dsecond.perm[0] = (d->perm[1] & ~1);
18134 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18135 dsecond.perm[2] = (d->perm[3] & ~1);
18136 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18137 dthird.perm[0] = (d->perm[0] % 2);
18138 dthird.perm[1] = (d->perm[1] % 2) + 4;
18139 dthird.perm[2] = (d->perm[2] % 2) + 2;
18140 dthird.perm[3] = (d->perm[3] % 2) + 6;
18141
18142 dfirst.target = gen_reg_rtx (dfirst.vmode);
18143 dsecond.target = gen_reg_rtx (dsecond.vmode);
18144 dthird.op0 = dfirst.target;
18145 dthird.op1 = dsecond.target;
18146 dthird.one_operand_p = false;
18147
18148 canonicalize_perm (&dfirst);
18149 canonicalize_perm (&dsecond);
18150
18151 ok = expand_vec_perm_1 (&dfirst)
18152 && expand_vec_perm_1 (&dsecond)
18153 && expand_vec_perm_1 (&dthird);
18154
18155 gcc_assert (ok);
18156
18157 return true;
18158 }
18159
18160 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18161
18162 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18163 a two vector permutation using two intra-lane vector
18164 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18165 the non-swapped and swapped vectors together. */
18166
18167 static bool
18168 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18169 {
18170 struct expand_vec_perm_d dfirst, dsecond, dthird;
18171 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18172 rtx_insn *seq1, *seq2;
18173 bool ok;
18174 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18175
18176 if (!TARGET_AVX
18177 || TARGET_AVX2
18178 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18179 || d->one_operand_p)
18180 return false;
18181
18182 dfirst = *d;
18183 dsecond = *d;
18184 for (i = 0; i < nelt; i++)
18185 {
18186 dfirst.perm[i] = 0xff;
18187 dsecond.perm[i] = 0xff;
18188 }
18189 for (i = 0, msk = 0; i < nelt; i++)
18190 {
18191 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18192 if (j == i)
18193 {
18194 dfirst.perm[j] = d->perm[i];
18195 which1 |= (d->perm[i] < nelt ? 1 : 2);
18196 }
18197 else
18198 {
18199 dsecond.perm[j] = d->perm[i];
18200 which2 |= (d->perm[i] < nelt ? 1 : 2);
18201 msk |= (1U << i);
18202 }
18203 }
18204 if (msk == 0 || msk == (1U << nelt) - 1)
18205 return false;
18206
18207 if (!d->testing_p)
18208 {
18209 dfirst.target = gen_reg_rtx (dfirst.vmode);
18210 dsecond.target = gen_reg_rtx (dsecond.vmode);
18211 }
18212
18213 for (i = 0; i < nelt; i++)
18214 {
18215 if (dfirst.perm[i] == 0xff)
18216 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18217 if (dsecond.perm[i] == 0xff)
18218 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18219 }
18220 canonicalize_perm (&dfirst);
18221 start_sequence ();
18222 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18223 seq1 = get_insns ();
18224 end_sequence ();
18225
18226 if (!ok)
18227 return false;
18228
18229 canonicalize_perm (&dsecond);
18230 start_sequence ();
18231 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18232 seq2 = get_insns ();
18233 end_sequence ();
18234
18235 if (!ok)
18236 return false;
18237
18238 if (d->testing_p)
18239 return true;
18240
18241 emit_insn (seq1);
18242 emit_insn (seq2);
18243
18244 dthird = *d;
18245 dthird.op0 = dsecond.target;
18246 dthird.op1 = dsecond.target;
18247 dthird.one_operand_p = true;
18248 dthird.target = gen_reg_rtx (dthird.vmode);
18249 for (i = 0; i < nelt; i++)
18250 dthird.perm[i] = i ^ nelt2;
18251
18252 ok = expand_vec_perm_1 (&dthird);
18253 gcc_assert (ok);
18254
18255 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18256 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18257 return true;
18258 }
18259
18260 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18261 permutation with two pshufb insns and an ior. We should have already
18262 failed all two instruction sequences. */
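/* Illustrative detail: if, say, element 5 of the result must come from
   byte 4 of op1 (d->perm[5] == 20 for V16QImode), byte 5 of the op0 mask
   becomes -128 (forcing zero) while byte 5 of the op1 mask becomes 4; the
   final ior combines the two pshufb results.  */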
18263
18264 static bool
18265 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18266 {
18267 rtx rperm[2][16], vperm, l, h, op, m128;
18268 unsigned int i, nelt, eltsz;
18269
18270 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18271 return false;
18272 gcc_assert (!d->one_operand_p);
18273
18274 if (d->testing_p)
18275 return true;
18276
18277 nelt = d->nelt;
18278 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18279
18280 /* Generate two permutation masks. If the required element is within
18281 the given vector it is shuffled into the proper lane. If the required
18282 element is in the other vector, force a zero into the lane by setting
18283 bit 7 in the permutation mask. */
18284 m128 = GEN_INT (-128);
18285 for (i = 0; i < nelt; ++i)
18286 {
18287 unsigned j, e = d->perm[i];
18288 unsigned which = (e >= nelt);
18289 if (e >= nelt)
18290 e -= nelt;
18291
18292 for (j = 0; j < eltsz; ++j)
18293 {
18294 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18295 rperm[1-which][i*eltsz + j] = m128;
18296 }
18297 }
18298
18299 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18300 vperm = force_reg (V16QImode, vperm);
18301
18302 l = gen_reg_rtx (V16QImode);
18303 op = gen_lowpart (V16QImode, d->op0);
18304 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18305
18306 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18307 vperm = force_reg (V16QImode, vperm);
18308
18309 h = gen_reg_rtx (V16QImode);
18310 op = gen_lowpart (V16QImode, d->op1);
18311 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18312
18313 op = d->target;
18314 if (d->vmode != V16QImode)
18315 op = gen_reg_rtx (V16QImode);
18316 emit_insn (gen_iorv16qi3 (op, l, h));
18317 if (op != d->target)
18318 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18319
18320 return true;
18321 }
18322
18323 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
18324 with two vpshufb insns, vpermq and vpor. We should have already failed
18325 all two or three instruction sequences. */
18326
18327 static bool
18328 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18329 {
18330 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18331 unsigned int i, nelt, eltsz;
18332
18333 if (!TARGET_AVX2
18334 || !d->one_operand_p
18335 || (d->vmode != V32QImode && d->vmode != V16HImode))
18336 return false;
18337
18338 if (d->testing_p)
18339 return true;
18340
18341 nelt = d->nelt;
18342 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18343
18344 /* Generate two permutation masks. If the required element is within
18345 the same lane, it is shuffled in. If the required element is from the
18346 other lane, force a zero by setting bit 7 in the permutation mask.
18347 The other mask has non-negative elements where an element is
18348 requested from the other lane, but also moved to the other lane,
18349 so that the result of vpshufb can have the two V2TImode halves
18350 swapped. */
18351 m128 = GEN_INT (-128);
18352 for (i = 0; i < nelt; ++i)
18353 {
18354 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18355 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18356
18357 for (j = 0; j < eltsz; ++j)
18358 {
18359 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18360 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18361 }
18362 }
18363
18364 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18365 vperm = force_reg (V32QImode, vperm);
18366
18367 h = gen_reg_rtx (V32QImode);
18368 op = gen_lowpart (V32QImode, d->op0);
18369 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18370
18371 /* Swap the 128-bit lanes of h into hp. */
18372 hp = gen_reg_rtx (V4DImode);
18373 op = gen_lowpart (V4DImode, h);
18374 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18375 const1_rtx));
18376
18377 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18378 vperm = force_reg (V32QImode, vperm);
18379
18380 l = gen_reg_rtx (V32QImode);
18381 op = gen_lowpart (V32QImode, d->op0);
18382 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18383
18384 op = d->target;
18385 if (d->vmode != V32QImode)
18386 op = gen_reg_rtx (V32QImode);
18387 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18388 if (op != d->target)
18389 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18390
18391 return true;
18392 }
18393
18394 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18395 and extract-odd permutations of two V32QImode and V16QImode operand
18396 with two vpshufb insns, vpor and vpermq. We should have already
18397 failed all two or three instruction sequences. */
18398
18399 static bool
18400 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18401 {
18402 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18403 unsigned int i, nelt, eltsz;
18404
18405 if (!TARGET_AVX2
18406 || d->one_operand_p
18407 || (d->vmode != V32QImode && d->vmode != V16HImode))
18408 return false;
18409
18410 for (i = 0; i < d->nelt; ++i)
18411 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18412 return false;
18413
18414 if (d->testing_p)
18415 return true;
18416
18417 nelt = d->nelt;
18418 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18419
18420 /* Generate two permutation masks. In the first permutation mask
18421 the first quarter will contain indexes for the first half
18422 of the op0, the second quarter will contain bit 7 set, third quarter
18423 will contain indexes for the second half of the op0 and the
18424 last quarter bit 7 set. In the second permutation mask
18425 the first quarter will contain bit 7 set, the second quarter
18426 indexes for the first half of the op1, the third quarter bit 7 set
18427 and last quarter indexes for the second half of the op1.
18428 I.e. the first mask e.g. for V32QImode extract even will be:
18429 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18430 (all values masked with 0xf except for -128) and second mask
18431 for extract even will be
18432 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18433 m128 = GEN_INT (-128);
18434 for (i = 0; i < nelt; ++i)
18435 {
18436 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18437 unsigned which = d->perm[i] >= nelt;
18438 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18439
18440 for (j = 0; j < eltsz; ++j)
18441 {
18442 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18443 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18444 }
18445 }
18446
18447 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18448 vperm = force_reg (V32QImode, vperm);
18449
18450 l = gen_reg_rtx (V32QImode);
18451 op = gen_lowpart (V32QImode, d->op0);
18452 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18453
18454 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18455 vperm = force_reg (V32QImode, vperm);
18456
18457 h = gen_reg_rtx (V32QImode);
18458 op = gen_lowpart (V32QImode, d->op1);
18459 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18460
18461 ior = gen_reg_rtx (V32QImode);
18462 emit_insn (gen_iorv32qi3 (ior, l, h));
18463
18464 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18465 op = gen_reg_rtx (V4DImode);
18466 ior = gen_lowpart (V4DImode, ior);
18467 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18468 const1_rtx, GEN_INT (3)));
18469 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18470
18471 return true;
18472 }
18473
18474 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18475 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18476 with two "and" and "pack" or two "shift" and "pack" insns. We should
18477 have already failed all two instruction sequences. */
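/* Illustrative example: the V8HImode extract-even permutation
   { 0, 2, 4, 6, 8, 10, 12, 14 } views both operands as V4SImode, masks each
   dword with 0xffff and packs the results with packusdw; for extract-odd
   the mask is replaced by a logical right shift of each dword by 16.  */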
18478
18479 static bool
18480 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18481 {
18482 rtx op, dop0, dop1, t;
18483 unsigned i, odd, c, s, nelt = d->nelt;
18484 bool end_perm = false;
18485 machine_mode half_mode;
18486 rtx (*gen_and) (rtx, rtx, rtx);
18487 rtx (*gen_pack) (rtx, rtx, rtx);
18488 rtx (*gen_shift) (rtx, rtx, rtx);
18489
18490 if (d->one_operand_p)
18491 return false;
18492
18493 switch (d->vmode)
18494 {
18495 case E_V8HImode:
18496 /* Required for "pack". */
18497 if (!TARGET_SSE4_1)
18498 return false;
18499 c = 0xffff;
18500 s = 16;
18501 half_mode = V4SImode;
18502 gen_and = gen_andv4si3;
18503 gen_pack = gen_sse4_1_packusdw;
18504 gen_shift = gen_lshrv4si3;
18505 break;
18506 case E_V16QImode:
18507 /* No check as all instructions are SSE2. */
18508 c = 0xff;
18509 s = 8;
18510 half_mode = V8HImode;
18511 gen_and = gen_andv8hi3;
18512 gen_pack = gen_sse2_packuswb;
18513 gen_shift = gen_lshrv8hi3;
18514 break;
18515 case E_V16HImode:
18516 if (!TARGET_AVX2)
18517 return false;
18518 c = 0xffff;
18519 s = 16;
18520 half_mode = V8SImode;
18521 gen_and = gen_andv8si3;
18522 gen_pack = gen_avx2_packusdw;
18523 gen_shift = gen_lshrv8si3;
18524 end_perm = true;
18525 break;
18526 case E_V32QImode:
18527 if (!TARGET_AVX2)
18528 return false;
18529 c = 0xff;
18530 s = 8;
18531 half_mode = V16HImode;
18532 gen_and = gen_andv16hi3;
18533 gen_pack = gen_avx2_packuswb;
18534 gen_shift = gen_lshrv16hi3;
18535 end_perm = true;
18536 break;
18537 default:
18538 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18539 general shuffles. */
18540 return false;
18541 }
18542
18543 /* Check that permutation is even or odd. */
18544 odd = d->perm[0];
18545 if (odd > 1)
18546 return false;
18547
18548 for (i = 1; i < nelt; ++i)
18549 if (d->perm[i] != 2 * i + odd)
18550 return false;
18551
18552 if (d->testing_p)
18553 return true;
18554
18555 dop0 = gen_reg_rtx (half_mode);
18556 dop1 = gen_reg_rtx (half_mode);
18557 if (odd == 0)
18558 {
18559 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18560 t = force_reg (half_mode, t);
18561 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18562 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18563 }
18564 else
18565 {
18566 emit_insn (gen_shift (dop0,
18567 gen_lowpart (half_mode, d->op0),
18568 GEN_INT (s)));
18569 emit_insn (gen_shift (dop1,
18570 gen_lowpart (half_mode, d->op1),
18571 GEN_INT (s)));
18572 }
18573 /* In AVX2 for 256 bit case we need to permute pack result. */
18574 if (TARGET_AVX2 && end_perm)
18575 {
18576 op = gen_reg_rtx (d->vmode);
18577 t = gen_reg_rtx (V4DImode);
18578 emit_insn (gen_pack (op, dop0, dop1));
18579 emit_insn (gen_avx2_permv4di_1 (t,
18580 gen_lowpart (V4DImode, op),
18581 const0_rtx,
18582 const2_rtx,
18583 const1_rtx,
18584 GEN_INT (3)));
18585 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18586 }
18587 else
18588 emit_insn (gen_pack (d->target, dop0, dop1));
18589
18590 return true;
18591 }
18592
18593 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18594 and extract-odd permutations of two V64QI operands
18595 with two "shift", two "trunc" and one "concat" insns for "odd"
18596 and two "trunc" and one "concat" insn for "even".
18597 We should have already failed all two instruction sequences. */
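/* Illustrative example: for the V64QImode extract-even permutation each
   operand is viewed as V32HImode and vpmovwb keeps the low byte of every
   16-bit element; for extract-odd the operands are first shifted right by 8
   so that the odd bytes become the low bytes before the truncation, and the
   two V32QImode halves are then concatenated.  */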
18598
18599 static bool
18600 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18601 {
18602 rtx t1, t2, t3, t4;
18603 unsigned i, odd, nelt = d->nelt;
18604
18605 if (!TARGET_AVX512BW
18606 || d->one_operand_p
18607 || d->vmode != V64QImode)
18608 return false;
18609
18610 /* Check that permutation is even or odd. */
18611 odd = d->perm[0];
18612 if (odd > 1)
18613 return false;
18614
18615 for (i = 1; i < nelt; ++i)
18616 if (d->perm[i] != 2 * i + odd)
18617 return false;
18618
18619 if (d->testing_p)
18620 return true;
18621
18622
18623 if (odd)
18624 {
18625 t1 = gen_reg_rtx (V32HImode);
18626 t2 = gen_reg_rtx (V32HImode);
18627 emit_insn (gen_lshrv32hi3 (t1,
18628 gen_lowpart (V32HImode, d->op0),
18629 GEN_INT (8)));
18630 emit_insn (gen_lshrv32hi3 (t2,
18631 gen_lowpart (V32HImode, d->op1),
18632 GEN_INT (8)));
18633 }
18634 else
18635 {
18636 t1 = gen_lowpart (V32HImode, d->op0);
18637 t2 = gen_lowpart (V32HImode, d->op1);
18638 }
18639
18640 t3 = gen_reg_rtx (V32QImode);
18641 t4 = gen_reg_rtx (V32QImode);
18642 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18643 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18644 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18645
18646 return true;
18647 }
18648
18649 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
18650 and extract-odd permutations. */
18651
18652 static bool
18653 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18654 {
18655 rtx t1, t2, t3, t4, t5;
18656
18657 switch (d->vmode)
18658 {
18659 case E_V4DFmode:
18660 if (d->testing_p)
18661 break;
18662 t1 = gen_reg_rtx (V4DFmode);
18663 t2 = gen_reg_rtx (V4DFmode);
18664
18665 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18666 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18667 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18668
18669 /* Now an unpck[lh]pd will produce the result required. */
18670 if (odd)
18671 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18672 else
18673 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18674 emit_insn (t3);
18675 break;
18676
18677 case E_V8SFmode:
18678 {
18679 int mask = odd ? 0xdd : 0x88;
18680
18681 if (d->testing_p)
18682 break;
18683 t1 = gen_reg_rtx (V8SFmode);
18684 t2 = gen_reg_rtx (V8SFmode);
18685 t3 = gen_reg_rtx (V8SFmode);
18686
18687 /* Shuffle within the 128-bit lanes to produce:
18688 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18689 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18690 GEN_INT (mask)));
18691
18692 /* Shuffle the lanes around to produce:
18693 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18694 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18695 GEN_INT (0x3)));
18696
18697 /* Shuffle within the 128-bit lanes to produce:
18698 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18699 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18700
18701 /* Shuffle within the 128-bit lanes to produce:
18702 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18703 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18704
18705 /* Shuffle the lanes around to produce:
18706 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18707 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18708 GEN_INT (0x20)));
18709 }
18710 break;
18711
18712 case E_V2DFmode:
18713 case E_V4SFmode:
18714 case E_V2DImode:
18715 case E_V4SImode:
18716 /* These are always directly implementable by expand_vec_perm_1. */
18717 gcc_unreachable ();
18718
18719 case E_V8HImode:
18720 if (TARGET_SSE4_1)
18721 return expand_vec_perm_even_odd_pack (d);
18722 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18723 return expand_vec_perm_pshufb2 (d);
18724 else
18725 {
18726 if (d->testing_p)
18727 break;
18728 /* We need 2*log2(N)-1 operations to achieve odd/even
18729 with interleave. */
18730 t1 = gen_reg_rtx (V8HImode);
18731 t2 = gen_reg_rtx (V8HImode);
18732 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18733 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18734 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18735 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18736 if (odd)
18737 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18738 else
18739 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18740 emit_insn (t3);
18741 }
18742 break;
18743
18744 case E_V16QImode:
18745 return expand_vec_perm_even_odd_pack (d);
18746
18747 case E_V16HImode:
18748 case E_V32QImode:
18749 return expand_vec_perm_even_odd_pack (d);
18750
18751 case E_V64QImode:
18752 return expand_vec_perm_even_odd_trunc (d);
18753
18754 case E_V4DImode:
18755 if (!TARGET_AVX2)
18756 {
18757 struct expand_vec_perm_d d_copy = *d;
18758 d_copy.vmode = V4DFmode;
18759 if (d->testing_p)
18760 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18761 else
18762 d_copy.target = gen_reg_rtx (V4DFmode);
18763 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18764 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18765 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18766 {
18767 if (!d->testing_p)
18768 emit_move_insn (d->target,
18769 gen_lowpart (V4DImode, d_copy.target));
18770 return true;
18771 }
18772 return false;
18773 }
18774
18775 if (d->testing_p)
18776 break;
18777
18778 t1 = gen_reg_rtx (V4DImode);
18779 t2 = gen_reg_rtx (V4DImode);
18780
18781 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18782 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18783 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18784
18785 /* Now an vpunpck[lh]qdq will produce the result required. */
18786 if (odd)
18787 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18788 else
18789 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18790 emit_insn (t3);
18791 break;
18792
18793 case E_V8SImode:
18794 if (!TARGET_AVX2)
18795 {
18796 struct expand_vec_perm_d d_copy = *d;
18797 d_copy.vmode = V8SFmode;
18798 if (d->testing_p)
18799 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18800 else
18801 d_copy.target = gen_reg_rtx (V8SFmode);
18802 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18803 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18804 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18805 {
18806 if (!d->testing_p)
18807 emit_move_insn (d->target,
18808 gen_lowpart (V8SImode, d_copy.target));
18809 return true;
18810 }
18811 return false;
18812 }
18813
18814 if (d->testing_p)
18815 break;
18816
18817 t1 = gen_reg_rtx (V8SImode);
18818 t2 = gen_reg_rtx (V8SImode);
18819 t3 = gen_reg_rtx (V4DImode);
18820 t4 = gen_reg_rtx (V4DImode);
18821 t5 = gen_reg_rtx (V4DImode);
18822
18823 /* Shuffle the lanes around into
18824 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18825 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18826 gen_lowpart (V4DImode, d->op1),
18827 GEN_INT (0x20)));
18828 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18829 gen_lowpart (V4DImode, d->op1),
18830 GEN_INT (0x31)));
18831
18832 /* Swap the 2nd and 3rd position in each lane into
18833 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18834 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18835 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18836 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18837 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18838
18839 /* Now an vpunpck[lh]qdq will produce
18840 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18841 if (odd)
18842 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18843 gen_lowpart (V4DImode, t2));
18844 else
18845 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18846 gen_lowpart (V4DImode, t2));
18847 emit_insn (t3);
18848 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18849 break;
18850
18851 default:
18852 gcc_unreachable ();
18853 }
18854
18855 return true;
18856 }
18857
18858 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18859 extract-even and extract-odd permutations. */
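/* For example, with V4SImode inputs { a b c d } and { e f g h }, the
   even permutation { 0 2 4 6 } extracts { a c e g } and the odd
   permutation { 1 3 5 7 } extracts { b d f h }.  */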
18860
18861 static bool
18862 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18863 {
18864 unsigned i, odd, nelt = d->nelt;
18865
18866 odd = d->perm[0];
18867 if (odd != 0 && odd != 1)
18868 return false;
18869
18870 for (i = 1; i < nelt; ++i)
18871 if (d->perm[i] != 2 * i + odd)
18872 return false;
18873
18874 return expand_vec_perm_even_odd_1 (d, odd);
18875 }
18876
18877 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
18878 permutations. We assume that expand_vec_perm_1 has already failed. */
18879
18880 static bool
18881 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18882 {
18883 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18884 machine_mode vmode = d->vmode;
18885 unsigned char perm2[4];
18886 rtx op0 = d->op0, dest;
18887 bool ok;
18888
18889 switch (vmode)
18890 {
18891 case E_V4DFmode:
18892 case E_V8SFmode:
18893 /* These are special-cased in sse.md so that we can optionally
18894 use the vbroadcast instruction. They expand to two insns
18895 if the input happens to be in a register. */
18896 gcc_unreachable ();
18897
18898 case E_V2DFmode:
18899 case E_V2DImode:
18900 case E_V4SFmode:
18901 case E_V4SImode:
18902 /* These are always implementable using standard shuffle patterns. */
18903 gcc_unreachable ();
18904
18905 case E_V8HImode:
18906 case E_V16QImode:
18907 /* These can be implemented via interleave. We save one insn by
18908 stopping once we have promoted to V4SImode and then using pshufd. */
18909 if (d->testing_p)
18910 return true;
18911 do
18912 {
18913 rtx dest;
18914 rtx (*gen) (rtx, rtx, rtx)
18915 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18916 : gen_vec_interleave_lowv8hi;
18917
18918 if (elt >= nelt2)
18919 {
18920 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18921 : gen_vec_interleave_highv8hi;
18922 elt -= nelt2;
18923 }
18924 nelt2 /= 2;
18925
18926 dest = gen_reg_rtx (vmode);
18927 emit_insn (gen (dest, op0, op0));
18928 vmode = get_mode_wider_vector (vmode);
18929 op0 = gen_lowpart (vmode, dest);
18930 }
18931 while (vmode != V4SImode);
18932
18933 memset (perm2, elt, 4);
18934 dest = gen_reg_rtx (V4SImode);
18935 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18936 gcc_assert (ok);
18937 if (!d->testing_p)
18938 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18939 return true;
18940
18941 case E_V64QImode:
18942 case E_V32QImode:
18943 case E_V16HImode:
18944 case E_V8SImode:
18945 case E_V4DImode:
18946 /* For AVX2, a broadcast of the first element should already have
18947 been handled by expand_vec_perm_1 via vpbroadcast* or vpermq. */
18948 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18949 return false;
18950
18951 default:
18952 gcc_unreachable ();
18953 }
18954 }
18955
18956 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18957 broadcast permutations. */
18958
18959 static bool
18960 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18961 {
18962 unsigned i, elt, nelt = d->nelt;
18963
18964 if (!d->one_operand_p)
18965 return false;
18966
18967 elt = d->perm[0];
18968 for (i = 1; i < nelt; ++i)
18969 if (d->perm[i] != elt)
18970 return false;
18971
18972 return expand_vec_perm_broadcast_1 (d);
18973 }
18974
18975 /* Implement arbitrary permutations of two V64QImode operands
18976 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18977 static bool
18978 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18979 {
18980 if (!TARGET_AVX512BW || d->vmode != V64QImode)
18981 return false;
18982
18983 if (d->testing_p)
18984 return true;
18985
18986 struct expand_vec_perm_d ds[2];
18987 rtx rperm[128], vperm, target0, target1;
18988 unsigned int i, nelt;
18989 machine_mode vmode;
18990
18991 nelt = d->nelt;
18992 vmode = V64QImode;
18993
18994 for (i = 0; i < 2; i++)
18995 {
18996 ds[i] = *d;
18997 ds[i].vmode = V32HImode;
18998 ds[i].nelt = 32;
18999 ds[i].target = gen_reg_rtx (V32HImode);
19000 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19001 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19002 }
19003
19004 /* Prepare permutations such that the first one (ds[0]) takes care
19005 of putting the even bytes into the right positions or one
19006 position higher, and the second one (ds[1]) takes care of
19007 putting the odd bytes into the right positions or one
19008 position lower. */
19009
19010 for (i = 0; i < nelt; i++)
19011 {
19012 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19013 if (i & 1)
19014 {
19015 rperm[i] = constm1_rtx;
19016 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19017 }
19018 else
19019 {
19020 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19021 rperm[i + 64] = constm1_rtx;
19022 }
19023 }
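/* At this point ds[0]/ds[1] are word permutations that place the word
   containing each requested byte at the corresponding destination word.
   The rperm byte masks then let vpshufb (which indexes within each
   128-bit lane, hence the "i & 14") pick the low or high byte of that
   word and zero every other byte (-1 sets bit 7), so a final vpor can
   merge the even- and odd-position results.  */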
19024
19025 bool ok = expand_vec_perm_1 (&ds[0]);
19026 gcc_assert (ok);
19027 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19028
19029 ok = expand_vec_perm_1 (&ds[1]);
19030 gcc_assert (ok);
19031 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19032
19033 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19034 vperm = force_reg (vmode, vperm);
19035 target0 = gen_reg_rtx (V64QImode);
19036 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19037
19038 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19039 vperm = force_reg (vmode, vperm);
19040 target1 = gen_reg_rtx (V64QImode);
19041 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19042
19043 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19044 return true;
19045 }
19046
19047 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
19048 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19049 all the shorter instruction sequences. */
19050
19051 static bool
19052 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19053 {
19054 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19055 unsigned int i, nelt, eltsz;
19056 bool used[4];
19057
19058 if (!TARGET_AVX2
19059 || d->one_operand_p
19060 || (d->vmode != V32QImode && d->vmode != V16HImode))
19061 return false;
19062
19063 if (d->testing_p)
19064 return true;
19065
19066 nelt = d->nelt;
19067 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19068
19069 /* Generate 4 permutation masks. If the required element is within
19070 the same lane, it is shuffled in. If the required element is from
19071 the other lane, force a zero by setting bit 7 in the permutation
19072 mask. The other mask has non-negative elements only where the
19073 element is requested from the other lane, and those elements are
19074 also moved to the other lane, so that the result of vpshufb can
19075 have its two V2TImode halves swapped. */
19076 m128 = GEN_INT (-128);
19077 for (i = 0; i < 32; ++i)
19078 {
19079 rperm[0][i] = m128;
19080 rperm[1][i] = m128;
19081 rperm[2][i] = m128;
19082 rperm[3][i] = m128;
19083 }
19084 used[0] = false;
19085 used[1] = false;
19086 used[2] = false;
19087 used[3] = false;
19088 for (i = 0; i < nelt; ++i)
19089 {
19090 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19091 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19092 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19093
19094 for (j = 0; j < eltsz; ++j)
19095 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19096 used[which] = true;
19097 }
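/* Masks 0 and 2 cover bytes taken from op0 resp. op1 that stay within
   their own 128-bit lane (l[0]/l[1] below); masks 1 and 3 cover bytes
   that must cross lanes (h[0]/h[1]), whose intermediate results are
   lane-swapped with vpermq before being merged.  */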
19098
19099 for (i = 0; i < 2; ++i)
19100 {
19101 if (!used[2 * i + 1])
19102 {
19103 h[i] = NULL_RTX;
19104 continue;
19105 }
19106 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19107 gen_rtvec_v (32, rperm[2 * i + 1]));
19108 vperm = force_reg (V32QImode, vperm);
19109 h[i] = gen_reg_rtx (V32QImode);
19110 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19111 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19112 }
19113
19114 /* Swap the 128-bit lanes of h[X]. */
19115 for (i = 0; i < 2; ++i)
19116 {
19117 if (h[i] == NULL_RTX)
19118 continue;
19119 op = gen_reg_rtx (V4DImode);
19120 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19121 const2_rtx, GEN_INT (3), const0_rtx,
19122 const1_rtx));
19123 h[i] = gen_lowpart (V32QImode, op);
19124 }
19125
19126 for (i = 0; i < 2; ++i)
19127 {
19128 if (!used[2 * i])
19129 {
19130 l[i] = NULL_RTX;
19131 continue;
19132 }
19133 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19134 vperm = force_reg (V32QImode, vperm);
19135 l[i] = gen_reg_rtx (V32QImode);
19136 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19137 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19138 }
19139
19140 for (i = 0; i < 2; ++i)
19141 {
19142 if (h[i] && l[i])
19143 {
19144 op = gen_reg_rtx (V32QImode);
19145 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19146 l[i] = op;
19147 }
19148 else if (h[i])
19149 l[i] = h[i];
19150 }
19151
19152 gcc_assert (l[0] && l[1]);
19153 op = d->target;
19154 if (d->vmode != V32QImode)
19155 op = gen_reg_rtx (V32QImode);
19156 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19157 if (op != d->target)
19158 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19159 return true;
19160 }
19161
19162 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19163 taken care of, perform the expansion in D and return true on success. */
19164
19165 static bool
19166 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19167 {
19168 /* Try a single instruction expansion. */
19169 if (expand_vec_perm_1 (d))
19170 return true;
19171
19172 /* Try sequences of two instructions. */
19173
19174 if (expand_vec_perm_pshuflw_pshufhw (d))
19175 return true;
19176
19177 if (expand_vec_perm_palignr (d, false))
19178 return true;
19179
19180 if (expand_vec_perm_interleave2 (d))
19181 return true;
19182
19183 if (expand_vec_perm_broadcast (d))
19184 return true;
19185
19186 if (expand_vec_perm_vpermq_perm_1 (d))
19187 return true;
19188
19189 if (expand_vec_perm_vperm2f128 (d))
19190 return true;
19191
19192 if (expand_vec_perm_pblendv (d))
19193 return true;
19194
19195 /* Try sequences of three instructions. */
19196
19197 if (expand_vec_perm_even_odd_pack (d))
19198 return true;
19199
19200 if (expand_vec_perm_2vperm2f128_vshuf (d))
19201 return true;
19202
19203 if (expand_vec_perm_pshufb2 (d))
19204 return true;
19205
19206 if (expand_vec_perm_interleave3 (d))
19207 return true;
19208
19209 if (expand_vec_perm_vperm2f128_vblend (d))
19210 return true;
19211
19212 /* Try sequences of four instructions. */
19213
19214 if (expand_vec_perm_even_odd_trunc (d))
19215 return true;
19216 if (expand_vec_perm_vpshufb2_vpermq (d))
19217 return true;
19218
19219 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19220 return true;
19221
19222 if (expand_vec_perm_vpermt2_vpshub2 (d))
19223 return true;
19224
19225 /* ??? Look for narrow permutations whose element orderings would
19226 allow the promotion to a wider mode. */
19227
19228 /* ??? Look for sequences of interleave or a wider permute that place
19229 the data into the correct lanes for a half-vector shuffle like
19230 pshuf[lh]w or vpermilps. */
19231
19232 /* ??? Look for sequences of interleave that produce the desired results.
19233 The combinatorics of punpck[lh] get pretty ugly... */
19234
19235 if (expand_vec_perm_even_odd (d))
19236 return true;
19237
19238 /* Even longer sequences. */
19239 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19240 return true;
19241
19242 /* See if we can get the same permutation in a different vector
19243 integer mode. */
19244 struct expand_vec_perm_d nd;
19245 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19246 {
19247 if (!d->testing_p)
19248 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19249 return true;
19250 }
19251
19252 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19253 if (expand_vec_perm2_vperm2f128_vblend (d))
19254 return true;
19255
19256 return false;
19257 }
19258
19259 /* If a permutation only uses one operand, make it clear. Returns true
19260 if the permutation references both operands. */
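/* For instance, with nelt == 4 the selector { 4 5 6 7 } reads only the
   second operand, so it is folded to { 0 1 2 3 } with op0 replaced by
   op1, while { 0 5 2 7 } keeps referencing both operands unless op0
   and op1 are identical.  */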
19261
19262 static bool
19263 canonicalize_perm (struct expand_vec_perm_d *d)
19264 {
19265 int i, which, nelt = d->nelt;
19266
19267 for (i = which = 0; i < nelt; ++i)
19268 which |= (d->perm[i] < nelt ? 1 : 2);
19269
19270 d->one_operand_p = true;
19271 switch (which)
19272 {
19273 default:
19274 gcc_unreachable ();
19275
19276 case 3:
19277 if (!rtx_equal_p (d->op0, d->op1))
19278 {
19279 d->one_operand_p = false;
19280 break;
19281 }
19282 /* The elements of PERM do not suggest that only the first operand
19283 is used, but both operands are identical. Allow easier matching
19284 of the permutation by folding the permutation into the single
19285 input vector. */
19286 /* FALLTHRU */
19287
19288 case 2:
19289 for (i = 0; i < nelt; ++i)
19290 d->perm[i] &= nelt - 1;
19291 d->op0 = d->op1;
19292 break;
19293
19294 case 1:
19295 d->op1 = d->op0;
19296 break;
19297 }
19298
19299 return (which == 3);
19300 }
19301
19302 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19303
19304 bool
19305 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19306 rtx op1, const vec_perm_indices &sel)
19307 {
19308 struct expand_vec_perm_d d;
19309 unsigned char perm[MAX_VECT_LEN];
19310 unsigned int i, nelt, which;
19311 bool two_args;
19312
19313 d.target = target;
19314 d.op0 = op0;
19315 d.op1 = op1;
19316
19317 d.vmode = vmode;
19318 gcc_assert (VECTOR_MODE_P (d.vmode));
19319 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19320 d.testing_p = !target;
19321
19322 gcc_assert (sel.length () == nelt);
19323 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19324
19325 /* Given sufficient ISA support we can just return true here
19326 for selected vector modes. */
19327 switch (d.vmode)
19328 {
19329 case E_V16SFmode:
19330 case E_V16SImode:
19331 case E_V8DImode:
19332 case E_V8DFmode:
19333 if (!TARGET_AVX512F)
19334 return false;
19335 /* All implementable with a single vperm[it]2 insn. */
19336 if (d.testing_p)
19337 return true;
19338 break;
19339 case E_V32HImode:
19340 if (!TARGET_AVX512BW)
19341 return false;
19342 if (d.testing_p)
19343 /* All implementable with a single vperm[it]2 insn. */
19344 return true;
19345 break;
19346 case E_V64QImode:
19347 if (!TARGET_AVX512BW)
19348 return false;
19349 if (d.testing_p)
19350 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19351 return true;
19352 break;
19353 case E_V8SImode:
19354 case E_V8SFmode:
19355 case E_V4DFmode:
19356 case E_V4DImode:
19357 if (!TARGET_AVX)
19358 return false;
19359 if (d.testing_p && TARGET_AVX512VL)
19360 /* All implementable with a single vperm[it]2 insn. */
19361 return true;
19362 break;
19363 case E_V16HImode:
19364 if (!TARGET_SSE2)
19365 return false;
19366 if (d.testing_p && TARGET_AVX2)
19367 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19368 return true;
19369 break;
19370 case E_V32QImode:
19371 if (!TARGET_SSE2)
19372 return false;
19373 if (d.testing_p && TARGET_AVX2)
19374 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19375 return true;
19376 break;
19377 case E_V8HImode:
19378 case E_V16QImode:
19379 if (!TARGET_SSE2)
19380 return false;
19381 /* Fall through. */
19382 case E_V4SImode:
19383 case E_V4SFmode:
19384 if (!TARGET_SSE)
19385 return false;
19386 /* All implementable with a single vpperm insn. */
19387 if (d.testing_p && TARGET_XOP)
19388 return true;
19389 /* All implementable with 2 pshufb + 1 ior. */
19390 if (d.testing_p && TARGET_SSSE3)
19391 return true;
19392 break;
19393 case E_V2DImode:
19394 case E_V2DFmode:
19395 if (!TARGET_SSE)
19396 return false;
19397 /* All implementable with shufpd or unpck[lh]pd. */
19398 if (d.testing_p)
19399 return true;
19400 break;
19401 default:
19402 return false;
19403 }
19404
19405 for (i = which = 0; i < nelt; ++i)
19406 {
19407 unsigned char e = sel[i];
19408 gcc_assert (e < 2 * nelt);
19409 d.perm[i] = e;
19410 perm[i] = e;
19411 which |= (e < nelt ? 1 : 2);
19412 }
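/* WHICH now has bit 0 set if any element is taken from the first
   vector and bit 1 set if any element is taken from the second.  */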
19413
19414 if (d.testing_p)
19415 {
19416 /* If all elements are from the second vector, fold them to the first. */
19417 if (which == 2)
19418 for (i = 0; i < nelt; ++i)
19419 d.perm[i] -= nelt;
19420
19421 /* Check whether the mask can be applied to the vector type. */
19422 d.one_operand_p = (which != 3);
19423
19424 /* Implementable with shufps or pshufd. */
19425 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
19426 return true;
19427
19428 /* Otherwise we have to go through the motions and see if we can
19429 figure out how to generate the requested permutation. */
19430 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19431 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19432 if (!d.one_operand_p)
19433 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19434
19435 start_sequence ();
19436 bool ret = ix86_expand_vec_perm_const_1 (&d);
19437 end_sequence ();
19438
19439 return ret;
19440 }
19441
19442 two_args = canonicalize_perm (&d);
19443
19444 if (ix86_expand_vec_perm_const_1 (&d))
19445 return true;
19446
19447 /* If the selector says both arguments are needed, but the operands are the
19448 same, the above tried to expand with one_operand_p and a flattened selector.
19449 If that didn't work, retry without one_operand_p; we succeeded with that
19450 during testing. */
19451 if (two_args && d.one_operand_p)
19452 {
19453 d.one_operand_p = false;
19454 memcpy (d.perm, perm, sizeof (perm));
19455 return ix86_expand_vec_perm_const_1 (&d);
19456 }
19457
19458 return false;
19459 }
19460
19461 void
19462 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19463 {
19464 struct expand_vec_perm_d d;
19465 unsigned i, nelt;
19466
19467 d.target = targ;
19468 d.op0 = op0;
19469 d.op1 = op1;
19470 d.vmode = GET_MODE (targ);
19471 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19472 d.one_operand_p = false;
19473 d.testing_p = false;
19474
19475 for (i = 0; i < nelt; ++i)
19476 d.perm[i] = i * 2 + odd;
19477
19478 /* We'll either be able to implement the permutation directly... */
19479 if (expand_vec_perm_1 (&d))
19480 return;
19481
19482 /* ... or we use the special-case patterns. */
19483 expand_vec_perm_even_odd_1 (&d, odd);
19484 }
19485
19486 static void
19487 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19488 {
19489 struct expand_vec_perm_d d;
19490 unsigned i, nelt, base;
19491 bool ok;
19492
19493 d.target = targ;
19494 d.op0 = op0;
19495 d.op1 = op1;
19496 d.vmode = GET_MODE (targ);
19497 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19498 d.one_operand_p = false;
19499 d.testing_p = false;
19500
19501 base = high_p ? nelt / 2 : 0;
19502 for (i = 0; i < nelt / 2; ++i)
19503 {
19504 d.perm[i * 2] = i + base;
19505 d.perm[i * 2 + 1] = i + base + nelt;
19506 }
19507
19508 /* Note that for AVX this isn't one instruction. */
19509 ok = ix86_expand_vec_perm_const_1 (&d);
19510 gcc_assert (ok);
19511 }
19512
19513
19514 /* Expand a vector operation CODE for a V*QImode in terms of the
19515 same operation on V*HImode. */
19516
19517 void
19518 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19519 {
19520 machine_mode qimode = GET_MODE (dest);
19521 machine_mode himode;
19522 rtx (*gen_il) (rtx, rtx, rtx);
19523 rtx (*gen_ih) (rtx, rtx, rtx);
19524 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19525 struct expand_vec_perm_d d;
19526 bool ok, full_interleave;
19527 bool uns_p = false;
19528 int i;
19529
19530 switch (qimode)
19531 {
19532 case E_V16QImode:
19533 himode = V8HImode;
19534 gen_il = gen_vec_interleave_lowv16qi;
19535 gen_ih = gen_vec_interleave_highv16qi;
19536 break;
19537 case E_V32QImode:
19538 himode = V16HImode;
19539 gen_il = gen_avx2_interleave_lowv32qi;
19540 gen_ih = gen_avx2_interleave_highv32qi;
19541 break;
19542 case E_V64QImode:
19543 himode = V32HImode;
19544 gen_il = gen_avx512bw_interleave_lowv64qi;
19545 gen_ih = gen_avx512bw_interleave_highv64qi;
19546 break;
19547 default:
19548 gcc_unreachable ();
19549 }
19550
19551 op2_l = op2_h = op2;
19552 switch (code)
19553 {
19554 case MULT:
19555 /* Unpack data such that we've got a source byte in each low byte of
19556 each word. We don't care what goes into the high byte of each word.
19557 Rather than trying to get zero in there, it is most convenient to
19558 let it be a copy of the low byte. */
19559 op2_l = gen_reg_rtx (qimode);
19560 op2_h = gen_reg_rtx (qimode);
19561 emit_insn (gen_il (op2_l, op2, op2));
19562 emit_insn (gen_ih (op2_h, op2, op2));
19563
19564 op1_l = gen_reg_rtx (qimode);
19565 op1_h = gen_reg_rtx (qimode);
19566 emit_insn (gen_il (op1_l, op1, op1));
19567 emit_insn (gen_ih (op1_h, op1, op1));
19568 full_interleave = qimode == V16QImode;
19569 break;
19570
19571 case ASHIFT:
19572 case LSHIFTRT:
19573 uns_p = true;
19574 /* FALLTHRU */
19575 case ASHIFTRT:
19576 op1_l = gen_reg_rtx (himode);
19577 op1_h = gen_reg_rtx (himode);
19578 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19579 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19580 full_interleave = true;
19581 break;
19582 default:
19583 gcc_unreachable ();
19584 }
19585
19586 /* Perform the operation. */
19587 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19588 1, OPTAB_DIRECT);
19589 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19590 1, OPTAB_DIRECT);
19591 gcc_assert (res_l && res_h);
19592
19593 /* Merge the data back into the right place. */
19594 d.target = dest;
19595 d.op0 = gen_lowpart (qimode, res_l);
19596 d.op1 = gen_lowpart (qimode, res_h);
19597 d.vmode = qimode;
19598 d.nelt = GET_MODE_NUNITS (qimode);
19599 d.one_operand_p = false;
19600 d.testing_p = false;
19601
19602 if (full_interleave)
19603 {
19604 /* For SSE2, we used a full interleave, so the desired
19605 results are in the even elements. */
19606 for (i = 0; i < d.nelt; ++i)
19607 d.perm[i] = i * 2;
19608 }
19609 else
19610 {
19611 /* For AVX, the interleave used above was not cross-lane. So the
19612 extraction is of the even elements, but with the second and third
19613 quarters swapped. Happily, that is even one insn shorter than a
19614 plain even extraction. For AVX512BW we have 4 lanes. We extract
19615 the evens from within a lane, always first from the first and then
19616 from the second source operand; the index bits above the low 4 bits remain the same.
19617 Thus, for d.nelt == 32 we want permutation
19618 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19619 and for d.nelt == 64 we want permutation
19620 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19621 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
19622 for (i = 0; i < d.nelt; ++i)
19623 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
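/* For example, with d.nelt == 32, i == 8 gives (16 & 14) + 32 + 0
   = 0 + 32 + 0 = 32, the first even element of the second operand,
   matching the permutation listed above.  */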
19624 }
19625
19626 ok = ix86_expand_vec_perm_const_1 (&d);
19627 gcc_assert (ok);
19628
19629 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19630 gen_rtx_fmt_ee (code, qimode, op1, op2));
19631 }
19632
19633 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
19634 if op is CONST_VECTOR with all odd elements equal to their
19635 preceding element. */
19636
19637 static bool
19638 const_vector_equal_evenodd_p (rtx op)
19639 {
19640 machine_mode mode = GET_MODE (op);
19641 int i, nunits = GET_MODE_NUNITS (mode);
19642 if (GET_CODE (op) != CONST_VECTOR
19643 || nunits != CONST_VECTOR_NUNITS (op))
19644 return false;
19645 for (i = 0; i < nunits; i += 2)
19646 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19647 return false;
19648 return true;
19649 }
19650
19651 void
19652 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19653 bool uns_p, bool odd_p)
19654 {
19655 machine_mode mode = GET_MODE (op1);
19656 machine_mode wmode = GET_MODE (dest);
19657 rtx x;
19658 rtx orig_op1 = op1, orig_op2 = op2;
19659
19660 if (!nonimmediate_operand (op1, mode))
19661 op1 = force_reg (mode, op1);
19662 if (!nonimmediate_operand (op2, mode))
19663 op2 = force_reg (mode, op2);
19664
19665 /* We only play even/odd games with vectors of SImode. */
19666 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19667
19668 /* If we're looking for the odd results, shift those members down to
19669 the even slots. For some CPUs this is faster than a PSHUFD. */
19670 if (odd_p)
19671 {
19672 /* For XOP use vpmacsdqh, but only for signed multiplication, as
19673 the instruction handles only signed operands. */
19674 if (TARGET_XOP && mode == V4SImode && !uns_p)
19675 {
19676 x = force_reg (wmode, CONST0_RTX (wmode));
19677 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19678 return;
19679 }
19680
19681 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19682 if (!const_vector_equal_evenodd_p (orig_op1))
19683 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19684 x, NULL, 1, OPTAB_DIRECT);
19685 if (!const_vector_equal_evenodd_p (orig_op2))
19686 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19687 x, NULL, 1, OPTAB_DIRECT);
19688 op1 = gen_lowpart (mode, op1);
19689 op2 = gen_lowpart (mode, op2);
19690 }
19691
19692 if (mode == V16SImode)
19693 {
19694 if (uns_p)
19695 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19696 else
19697 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19698 }
19699 else if (mode == V8SImode)
19700 {
19701 if (uns_p)
19702 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19703 else
19704 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19705 }
19706 else if (uns_p)
19707 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19708 else if (TARGET_SSE4_1)
19709 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19710 else
19711 {
19712 rtx s1, s2, t0, t1, t2;
19713
19714 /* The easiest way to implement this without PMULDQ is to go through
19715 the motions as if we were performing a full 64-bit multiply, with
19716 the exception that we need to do less shuffling of the elements. */
19717
19718 /* Compute the sign-extension, aka highparts, of the two operands. */
19719 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19720 op1, pc_rtx, pc_rtx);
19721 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19722 op2, pc_rtx, pc_rtx);
19723
19724 /* Multiply LO(A) * HI(B), and vice-versa. */
19725 t1 = gen_reg_rtx (wmode);
19726 t2 = gen_reg_rtx (wmode);
19727 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19728 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19729
19730 /* Multiply LO(A) * LO(B). */
19731 t0 = gen_reg_rtx (wmode);
19732 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19733
19734 /* Combine and shift the highparts into place. */
19735 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19736 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19737 1, OPTAB_DIRECT);
19738
19739 /* Combine high and low parts. */
19740 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19741 return;
19742 }
19743 emit_insn (x);
19744 }
19745
19746 void
19747 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19748 bool uns_p, bool high_p)
19749 {
19750 machine_mode wmode = GET_MODE (dest);
19751 machine_mode mode = GET_MODE (op1);
19752 rtx t1, t2, t3, t4, mask;
19753
19754 switch (mode)
19755 {
19756 case E_V4SImode:
19757 t1 = gen_reg_rtx (mode);
19758 t2 = gen_reg_rtx (mode);
19759 if (TARGET_XOP && !uns_p)
19760 {
19761 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19762 shuffle the elements once so that all elements are in the right
19763 place for immediate use: { A C B D }. */
19764 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19765 const1_rtx, GEN_INT (3)));
19766 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19767 const1_rtx, GEN_INT (3)));
19768 }
19769 else
19770 {
19771 /* Put the elements into place for the multiply. */
19772 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19773 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19774 high_p = false;
19775 }
19776 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19777 break;
19778
19779 case E_V8SImode:
19780 /* Shuffle the elements between the lanes. After this we
19781 have { A B E F | C D G H } for each operand. */
19782 t1 = gen_reg_rtx (V4DImode);
19783 t2 = gen_reg_rtx (V4DImode);
19784 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19785 const0_rtx, const2_rtx,
19786 const1_rtx, GEN_INT (3)));
19787 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19788 const0_rtx, const2_rtx,
19789 const1_rtx, GEN_INT (3)));
19790
19791 /* Shuffle the elements within the lanes. After this we
19792 have { A A B B | C C D D } or { E E F F | G G H H }. */
19793 t3 = gen_reg_rtx (V8SImode);
19794 t4 = gen_reg_rtx (V8SImode);
19795 mask = GEN_INT (high_p
19796 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19797 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
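/* The pshufd immediate replicates the element pair { 0, 1 } (low) or
   { 2, 3 } (high) within each 128-bit lane, i.e. it selects
   { 0 0 1 1 } or { 2 2 3 3 } per lane.  */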
19798 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19799 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19800
19801 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19802 break;
19803
19804 case E_V8HImode:
19805 case E_V16HImode:
19806 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19807 uns_p, OPTAB_DIRECT);
19808 t2 = expand_binop (mode,
19809 uns_p ? umul_highpart_optab : smul_highpart_optab,
19810 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19811 gcc_assert (t1 && t2);
19812
19813 t3 = gen_reg_rtx (mode);
19814 ix86_expand_vec_interleave (t3, t1, t2, high_p);
19815 emit_move_insn (dest, gen_lowpart (wmode, t3));
19816 break;
19817
19818 case E_V16QImode:
19819 case E_V32QImode:
19820 case E_V32HImode:
19821 case E_V16SImode:
19822 case E_V64QImode:
19823 t1 = gen_reg_rtx (wmode);
19824 t2 = gen_reg_rtx (wmode);
19825 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19826 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19827
19828 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19829 break;
19830
19831 default:
19832 gcc_unreachable ();
19833 }
19834 }
19835
19836 void
19837 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19838 {
19839 rtx res_1, res_2, res_3, res_4;
19840
19841 res_1 = gen_reg_rtx (V4SImode);
19842 res_2 = gen_reg_rtx (V4SImode);
19843 res_3 = gen_reg_rtx (V2DImode);
19844 res_4 = gen_reg_rtx (V2DImode);
19845 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19846 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19847
19848 /* Move the results in element 2 down to element 1; we don't care
19849 what goes in elements 2 and 3. Then we can merge the parts
19850 back together with an interleave.
19851
19852 Note that two other sequences were tried:
19853 (1) Use interleaves at the start instead of psrldq, which allows
19854 us to use a single shufps to merge things back at the end.
19855 (2) Use shufps here to combine the two vectors, then pshufd to
19856 put the elements in the correct order.
19857 In both cases the cost of the reformatting stall was too high
19858 and the overall sequence slower. */
19859
19860 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19861 const0_rtx, const2_rtx,
19862 const0_rtx, const0_rtx));
19863 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19864 const0_rtx, const2_rtx,
19865 const0_rtx, const0_rtx));
19866 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19867
19868 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19869 }
19870
19871 void
19872 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19873 {
19874 machine_mode mode = GET_MODE (op0);
19875 rtx t1, t2, t3, t4, t5, t6;
19876
19877 if (TARGET_AVX512DQ && mode == V8DImode)
19878 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19879 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19880 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19881 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19882 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19883 else if (TARGET_XOP && mode == V2DImode)
19884 {
19885 /* op1: A,B,C,D, op2: E,F,G,H */
19886 op1 = gen_lowpart (V4SImode, op1);
19887 op2 = gen_lowpart (V4SImode, op2);
19888
19889 t1 = gen_reg_rtx (V4SImode);
19890 t2 = gen_reg_rtx (V4SImode);
19891 t3 = gen_reg_rtx (V2DImode);
19892 t4 = gen_reg_rtx (V2DImode);
19893
19894 /* t1: B,A,D,C */
19895 emit_insn (gen_sse2_pshufd_1 (t1, op1,
19896 GEN_INT (1),
19897 GEN_INT (0),
19898 GEN_INT (3),
19899 GEN_INT (2)));
19900
19901 /* t2: (B*E),(A*F),(D*G),(C*H) */
19902 emit_insn (gen_mulv4si3 (t2, t1, op2));
19903
19904 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19905 emit_insn (gen_xop_phadddq (t3, t2));
19906
19907 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19908 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19909
19910 /* Multiply the lower parts and add all the partial products. */
19911 t5 = gen_reg_rtx (V2DImode);
19912 emit_insn (gen_vec_widen_umult_even_v4si (t5,
19913 gen_lowpart (V4SImode, op1),
19914 gen_lowpart (V4SImode, op2)));
19915 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19916 }
19917 else
19918 {
19919 machine_mode nmode;
19920 rtx (*umul) (rtx, rtx, rtx);
19921
19922 if (mode == V2DImode)
19923 {
19924 umul = gen_vec_widen_umult_even_v4si;
19925 nmode = V4SImode;
19926 }
19927 else if (mode == V4DImode)
19928 {
19929 umul = gen_vec_widen_umult_even_v8si;
19930 nmode = V8SImode;
19931 }
19932 else if (mode == V8DImode)
19933 {
19934 umul = gen_vec_widen_umult_even_v16si;
19935 nmode = V16SImode;
19936 }
19937 else
19938 gcc_unreachable ();
19939
19940
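/* The low 64 bits of each product are computed as
   lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32), using the
   even-element widening unsigned multiply for each 32x32->64 piece.  */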
19941 /* Multiply low parts. */
19942 t1 = gen_reg_rtx (mode);
19943 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19944
19945 /* Shift input vectors right 32 bits so we can multiply high parts. */
19946 t6 = GEN_INT (32);
19947 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19948 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19949
19950 /* Multiply high parts by low parts. */
19951 t4 = gen_reg_rtx (mode);
19952 t5 = gen_reg_rtx (mode);
19953 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19954 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19955
19956 /* Combine and shift the highparts back. */
19957 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19958 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19959
19960 /* Combine high and low parts. */
19961 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19962 }
19963
19964 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19965 gen_rtx_MULT (mode, op1, op2));
19966 }
19967
19968 /* Return true if control transfer instruction INSN
19969 should be encoded with the notrack prefix. */
19970
19971 bool
19972 ix86_notrack_prefixed_insn_p (rtx insn)
19973 {
19974 if (!insn || !(flag_cf_protection & CF_BRANCH))
19975 return false;
19976
19977 if (CALL_P (insn))
19978 {
19979 rtx call = get_call_rtx_from (insn);
19980 gcc_assert (call != NULL_RTX);
19981 rtx addr = XEXP (call, 0);
19982
19983 /* Do not emit 'notrack' if it's not an indirect call. */
19984 if (MEM_P (addr)
19985 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19986 return false;
19987 else
19988 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19989 }
19990
19991 if (JUMP_P (insn) && !flag_cet_switch)
19992 {
19993 rtx target = JUMP_LABEL (insn);
19994 if (target == NULL_RTX || ANY_RETURN_P (target))
19995 return false;
19996
19997 /* Check whether the jump is through a switch table. */
19998 rtx_insn *label = as_a<rtx_insn *> (target);
19999 rtx_insn *table = next_insn (label);
20000 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20001 return false;
20002 else
20003 return true;
20004 }
20005 return false;
20006 }
20007
20008 /* Calculate integer vector abs(), using only SSE2 instructions where newer ISAs do not provide a shorter sequence. */
20009
20010 void
20011 ix86_expand_sse2_abs (rtx target, rtx input)
20012 {
20013 machine_mode mode = GET_MODE (target);
20014 rtx tmp0, tmp1, x;
20015
20016 switch (mode)
20017 {
20018 case E_V2DImode:
20019 case E_V4DImode:
20020 /* For 64-bit signed integer X, with SSE4.2 use
20021 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20022 Otherwise handle it similarly to V4SImode, except use 64 as W
20023 instead of 32 and, since the arithmetic right shift is unimplemented
20024 for this mode, form the sign mask with a logical right shift followed by a negation. */
20025 if (TARGET_SSE4_2)
20026 {
20027 tmp0 = gen_reg_rtx (mode);
20028 tmp1 = gen_reg_rtx (mode);
20029 emit_move_insn (tmp1, CONST0_RTX (mode));
20030 if (mode == E_V2DImode)
20031 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20032 else
20033 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20034 }
20035 else
20036 {
20037 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20038 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20039 - 1), NULL, 0, OPTAB_DIRECT);
20040 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20041 }
20042
20043 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20044 NULL, 0, OPTAB_DIRECT);
20045 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20046 target, 0, OPTAB_DIRECT);
20047 break;
20048
20049 case E_V4SImode:
20050 /* For 32-bit signed integer X, the best way to calculate the absolute
20051 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
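/* E.g. for X = -5: X >> 31 = -1, (-1 ^ -5) = 4 and 4 - (-1) = 5.  */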
20052 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20053 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20054 NULL, 0, OPTAB_DIRECT);
20055 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20056 NULL, 0, OPTAB_DIRECT);
20057 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20058 target, 0, OPTAB_DIRECT);
20059 break;
20060
20061 case E_V8HImode:
20062 /* For 16-bit signed integer X, the best way to calculate the absolute
20063 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20064 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20065
20066 x = expand_simple_binop (mode, SMAX, tmp0, input,
20067 target, 0, OPTAB_DIRECT);
20068 break;
20069
20070 case E_V16QImode:
20071 /* For 8-bit signed integer X, the best way to calculate the absolute
20072 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20073 as SSE2 provides the PMINUB insn. */
20074 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20075
20076 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20077 target, 0, OPTAB_DIRECT);
20078 break;
20079
20080 default:
20081 gcc_unreachable ();
20082 }
20083
20084 if (x != target)
20085 emit_move_insn (target, x);
20086 }
20087
20088 /* Expand an extract from a vector register through pextr insn.
20089 Return true if successful. */
20090
20091 bool
20092 ix86_expand_pextr (rtx *operands)
20093 {
20094 rtx dst = operands[0];
20095 rtx src = operands[1];
20096
20097 unsigned int size = INTVAL (operands[2]);
20098 unsigned int pos = INTVAL (operands[3]);
20099
20100 if (SUBREG_P (dst))
20101 {
20102 /* Reject non-lowpart subregs. */
20103 if (SUBREG_BYTE (dst) > 0)
20104 return false;
20105 dst = SUBREG_REG (dst);
20106 }
20107
20108 if (SUBREG_P (src))
20109 {
20110 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20111 src = SUBREG_REG (src);
20112 }
20113
20114 switch (GET_MODE (src))
20115 {
20116 case E_V16QImode:
20117 case E_V8HImode:
20118 case E_V4SImode:
20119 case E_V2DImode:
20120 case E_V1TImode:
20121 case E_TImode:
20122 {
20123 machine_mode srcmode, dstmode;
20124 rtx d, pat;
20125
20126 if (!int_mode_for_size (size, 0).exists (&dstmode))
20127 return false;
20128
20129 switch (dstmode)
20130 {
20131 case E_QImode:
20132 if (!TARGET_SSE4_1)
20133 return false;
20134 srcmode = V16QImode;
20135 break;
20136
20137 case E_HImode:
20138 if (!TARGET_SSE2)
20139 return false;
20140 srcmode = V8HImode;
20141 break;
20142
20143 case E_SImode:
20144 if (!TARGET_SSE4_1)
20145 return false;
20146 srcmode = V4SImode;
20147 break;
20148
20149 case E_DImode:
20150 gcc_assert (TARGET_64BIT);
20151 if (!TARGET_SSE4_1)
20152 return false;
20153 srcmode = V2DImode;
20154 break;
20155
20156 default:
20157 return false;
20158 }
20159
20160 /* Reject extractions from misaligned positions. */
20161 if (pos & (size-1))
20162 return false;
20163
20164 if (GET_MODE (dst) == dstmode)
20165 d = dst;
20166 else
20167 d = gen_reg_rtx (dstmode);
20168
20169 /* Construct insn pattern. */
20170 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20171 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
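/* I.e. a (vec_select:DSTMODE ... (parallel [POS / SIZE])) extracting
   the single element at index POS / SIZE from SRC viewed in SRCMODE.  */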
20172
20173 /* Let the rtl optimizers know about the zero extension performed. */
20174 if (dstmode == QImode || dstmode == HImode)
20175 {
20176 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20177 d = gen_lowpart (SImode, d);
20178 }
20179
20180 emit_insn (gen_rtx_SET (d, pat));
20181
20182 if (d != dst)
20183 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20184 return true;
20185 }
20186
20187 default:
20188 return false;
20189 }
20190 }
20191
20192 /* Expand an insert into a vector register through pinsr insn.
20193 Return true if successful. */
20194
20195 bool
20196 ix86_expand_pinsr (rtx *operands)
20197 {
20198 rtx dst = operands[0];
20199 rtx src = operands[3];
20200
20201 unsigned int size = INTVAL (operands[1]);
20202 unsigned int pos = INTVAL (operands[2]);
20203
20204 if (SUBREG_P (dst))
20205 {
20206 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20207 dst = SUBREG_REG (dst);
20208 }
20209
20210 switch (GET_MODE (dst))
20211 {
20212 case E_V16QImode:
20213 case E_V8HImode:
20214 case E_V4SImode:
20215 case E_V2DImode:
20216 case E_V1TImode:
20217 case E_TImode:
20218 {
20219 machine_mode srcmode, dstmode;
20220 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20221 rtx d;
20222
20223 if (!int_mode_for_size (size, 0).exists (&srcmode))
20224 return false;
20225
20226 switch (srcmode)
20227 {
20228 case E_QImode:
20229 if (!TARGET_SSE4_1)
20230 return false;
20231 dstmode = V16QImode;
20232 pinsr = gen_sse4_1_pinsrb;
20233 break;
20234
20235 case E_HImode:
20236 if (!TARGET_SSE2)
20237 return false;
20238 dstmode = V8HImode;
20239 pinsr = gen_sse2_pinsrw;
20240 break;
20241
20242 case E_SImode:
20243 if (!TARGET_SSE4_1)
20244 return false;
20245 dstmode = V4SImode;
20246 pinsr = gen_sse4_1_pinsrd;
20247 break;
20248
20249 case E_DImode:
20250 gcc_assert (TARGET_64BIT);
20251 if (!TARGET_SSE4_1)
20252 return false;
20253 dstmode = V2DImode;
20254 pinsr = gen_sse4_1_pinsrq;
20255 break;
20256
20257 default:
20258 return false;
20259 }
20260
20261 /* Reject insertions to misaligned positions. */
20262 if (pos & (size-1))
20263 return false;
20264
20265 if (SUBREG_P (src))
20266 {
20267 unsigned int srcpos = SUBREG_BYTE (src);
20268
20269 if (srcpos > 0)
20270 {
20271 rtx extr_ops[4];
20272
20273 extr_ops[0] = gen_reg_rtx (srcmode);
20274 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20275 extr_ops[2] = GEN_INT (size);
20276 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20277
20278 if (!ix86_expand_pextr (extr_ops))
20279 return false;
20280
20281 src = extr_ops[0];
20282 }
20283 else
20284 src = gen_lowpart (srcmode, SUBREG_REG (src));
20285 }
20286
20287 if (GET_MODE (dst) == dstmode)
20288 d = dst;
20289 else
20290 d = gen_reg_rtx (dstmode);
20291
20292 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20293 gen_lowpart (srcmode, src),
20294 GEN_INT (1 << (pos / size))));
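/* Note that the pinsr expanders here are passed a one-hot element
   selector (1 << (pos / size)), not the element index itself.  */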
20295 if (d != dst)
20296 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20297 return true;
20298 }
20299
20300 default:
20301 return false;
20302 }
20303 }
20304
20305 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
20306 of the upper half against the lower half, narrowing down to SSE reg size. */
20307
20308 machine_mode
20309 ix86_split_reduction (machine_mode mode)
20310 {
20311 /* Reduce lowpart against highpart until we reach SSE reg width to
20312 avoid cross-lane operations. */
20313 switch (mode)
20314 {
20315 case E_V8DImode:
20316 case E_V4DImode:
20317 return V2DImode;
20318 case E_V16SImode:
20319 case E_V8SImode:
20320 return V4SImode;
20321 case E_V32HImode:
20322 case E_V16HImode:
20323 return V8HImode;
20324 case E_V64QImode:
20325 case E_V32QImode:
20326 return V16QImode;
20327 case E_V16SFmode:
20328 case E_V8SFmode:
20329 return V4SFmode;
20330 case E_V8DFmode:
20331 case E_V4DFmode:
20332 return V2DFmode;
20333 default:
20334 return mode;
20335 }
20336 }
20337
20338 /* Generate a call to a divmod libfunc such as __divmoddi4; return the quotient in *QUOT_P and the remainder in *REM_P. */
20339
20340 void
20341 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20342 rtx op0, rtx op1,
20343 rtx *quot_p, rtx *rem_p)
20344 {
20345 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20346
20347 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20348 mode, op0, mode, op1, mode,
20349 XEXP (rem, 0), Pmode);
20350 *quot_p = quot;
20351 *rem_p = rem;
20352 }
20353
20354 #include "gt-i386-expand.h"