Add GCC support for ENQCMD.
[gcc.git] / gcc / config / i386 / i386-expand.c
1 /* Copyright (C) 1988-2019 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "params.h"
62 #include "cselib.h"
63 #include "sched-int.h"
64 #include "opts.h"
65 #include "tree-pass.h"
66 #include "context.h"
67 #include "pass_manager.h"
68 #include "target-globals.h"
69 #include "gimple-iterator.h"
70 #include "tree-vectorizer.h"
71 #include "shrink-wrap.h"
72 #include "builtins.h"
73 #include "rtl-iter.h"
74 #include "tree-iterator.h"
75 #include "dbgcnt.h"
76 #include "case-cfn-macros.h"
77 #include "dojump.h"
78 #include "fold-const-call.h"
79 #include "tree-vrp.h"
80 #include "tree-ssanames.h"
81 #include "selftest.h"
82 #include "selftest-rtl.h"
83 #include "print-rtl.h"
84 #include "intl.h"
85 #include "ifcvt.h"
86 #include "symbol-summary.h"
87 #include "ipa-prop.h"
88 #include "ipa-fnsummary.h"
89 #include "wide-int-bitmask.h"
90 #include "tree-vector-builder.h"
91 #include "debug.h"
92 #include "dwarf2out.h"
93 #include "i386-options.h"
94 #include "i386-builtins.h"
95 #include "i386-expand.h"
96
97 /* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
102
103 void
104 split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106 {
107 machine_mode half_mode;
108 unsigned int byte;
109
110 switch (mode)
111 {
112 case E_TImode:
113 half_mode = DImode;
114 break;
115 case E_DImode:
116 half_mode = SImode;
117 break;
118 default:
119 gcc_unreachable ();
120 }
121
122 byte = GET_MODE_SIZE (half_mode);
123
124 while (num--)
125 {
126 rtx op = operands[num];
127
128 /* simplify_subreg refuses to split volatile memory addresses,
129 but we still have to handle them. */
130 if (MEM_P (op))
131 {
132 lo_half[num] = adjust_address (op, half_mode, 0);
133 hi_half[num] = adjust_address (op, half_mode, byte);
134 }
135 else
136 {
137 lo_half[num] = simplify_gen_subreg (half_mode, op,
138 GET_MODE (op) == VOIDmode
139 ? mode : GET_MODE (op), 0);
140 hi_half[num] = simplify_gen_subreg (half_mode, op,
141 GET_MODE (op) == VOIDmode
142 ? mode : GET_MODE (op), byte);
143 }
144 }
145 }
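/* For example (an illustrative use of the helper above): splitting a
   single DImode operand OP into its SImode halves

     rtx lo, hi;
     split_double_mode (DImode, &op, 1, &lo, &hi);

   leaves the low word in LO and the high word in HI, as (simplified)
   subregs for registers and constants, or as adjusted addresses for
   MEMs.  */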
146
147 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
148 for the target. */
149
150 void
151 ix86_expand_clear (rtx dest)
152 {
153 rtx tmp;
154
155 /* We play register width games, which are only valid after reload. */
156 gcc_assert (reload_completed);
157
158 /* Avoid HImode and its attendant prefix byte. */
159 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
160 dest = gen_rtx_REG (SImode, REGNO (dest));
161 tmp = gen_rtx_SET (dest, const0_rtx);
162
163 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
164 {
165 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
166 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
167 }
168
169 emit_insn (tmp);
170 }
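/* Illustrative note on the expansion above: when the flags may be
   clobbered (the !TARGET_USE_MOV0 or size-optimized case) the emitted
   RTL is

     (parallel [(set (reg:SI n) (const_int 0))
                (clobber (reg:CC flags))])

   which is output as an xor of the register with itself; otherwise a
   plain (set (reg) (const_int 0)) is emitted, i.e. "mov $0, reg".  */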
171
172 void
173 ix86_expand_move (machine_mode mode, rtx operands[])
174 {
175 rtx op0, op1;
176 rtx tmp, addend = NULL_RTX;
177 enum tls_model model;
178
179 op0 = operands[0];
180 op1 = operands[1];
181
182 switch (GET_CODE (op1))
183 {
184 case CONST:
185 tmp = XEXP (op1, 0);
186
187 if (GET_CODE (tmp) != PLUS
188 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
189 break;
190
191 op1 = XEXP (tmp, 0);
192 addend = XEXP (tmp, 1);
193 /* FALLTHRU */
194
195 case SYMBOL_REF:
196 model = SYMBOL_REF_TLS_MODEL (op1);
197
198 if (model)
199 op1 = legitimize_tls_address (op1, model, true);
200 else if (ix86_force_load_from_GOT_p (op1))
201 {
202 /* Load the external function address via GOT slot to avoid PLT. */
203 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
204 (TARGET_64BIT
205 ? UNSPEC_GOTPCREL
206 : UNSPEC_GOT));
207 op1 = gen_rtx_CONST (Pmode, op1);
208 op1 = gen_const_mem (Pmode, op1);
209 set_mem_alias_set (op1, ix86_GOT_alias_set ());
210 }
211 else
212 {
213 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
214 if (tmp)
215 {
216 op1 = tmp;
217 if (!addend)
218 break;
219 }
220 else
221 {
222 op1 = operands[1];
223 break;
224 }
225 }
226
227 if (addend)
228 {
229 op1 = force_operand (op1, NULL_RTX);
230 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
231 op0, 1, OPTAB_DIRECT);
232 }
233 else
234 op1 = force_operand (op1, op0);
235
236 if (op1 == op0)
237 return;
238
239 op1 = convert_to_mode (mode, op1, 1);
240
241 default:
242 break;
243 }
244
245 if ((flag_pic || MACHOPIC_INDIRECT)
246 && symbolic_operand (op1, mode))
247 {
248 if (TARGET_MACHO && !TARGET_64BIT)
249 {
250 #if TARGET_MACHO
251 /* dynamic-no-pic */
252 if (MACHOPIC_INDIRECT)
253 {
254 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
255 ? op0 : gen_reg_rtx (Pmode);
256 op1 = machopic_indirect_data_reference (op1, temp);
257 if (MACHOPIC_PURE)
258 op1 = machopic_legitimize_pic_address (op1, mode,
259 temp == op1 ? 0 : temp);
260 }
261 if (op0 != op1 && GET_CODE (op0) != MEM)
262 {
263 rtx insn = gen_rtx_SET (op0, op1);
264 emit_insn (insn);
265 return;
266 }
267 if (GET_CODE (op0) == MEM)
268 op1 = force_reg (Pmode, op1);
269 else
270 {
271 rtx temp = op0;
272 if (GET_CODE (temp) != REG)
273 temp = gen_reg_rtx (Pmode);
274 temp = legitimize_pic_address (op1, temp);
275 if (temp == op0)
276 return;
277 op1 = temp;
278 }
279 /* dynamic-no-pic */
280 #endif
281 }
282 else
283 {
284 if (MEM_P (op0))
285 op1 = force_reg (mode, op1);
286 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
287 {
288 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
289 op1 = legitimize_pic_address (op1, reg);
290 if (op0 == op1)
291 return;
292 op1 = convert_to_mode (mode, op1, 1);
293 }
294 }
295 }
296 else
297 {
298 if (MEM_P (op0)
299 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
300 || !push_operand (op0, mode))
301 && MEM_P (op1))
302 op1 = force_reg (mode, op1);
303
304 if (push_operand (op0, mode)
305 && ! general_no_elim_operand (op1, mode))
306 op1 = copy_to_mode_reg (mode, op1);
307
308 /* Force large constants in 64-bit compilation into a register
309 to get them CSEd. */
310 if (can_create_pseudo_p ()
311 && (mode == DImode) && TARGET_64BIT
312 && immediate_operand (op1, mode)
313 && !x86_64_zext_immediate_operand (op1, VOIDmode)
314 && !register_operand (op0, mode)
315 && optimize)
316 op1 = copy_to_mode_reg (mode, op1);
317
318 if (can_create_pseudo_p ()
319 && CONST_DOUBLE_P (op1))
320 {
321 /* If we are loading a floating point constant to a register,
322 force the value to memory now, since we'll get better code
323 out the back end. */
324
325 op1 = validize_mem (force_const_mem (mode, op1));
326 if (!register_operand (op0, mode))
327 {
328 rtx temp = gen_reg_rtx (mode);
329 emit_insn (gen_rtx_SET (temp, op1));
330 emit_move_insn (op0, temp);
331 return;
332 }
333 }
334 }
335
336 emit_insn (gen_rtx_SET (op0, op1));
337 }
338
339 void
340 ix86_expand_vector_move (machine_mode mode, rtx operands[])
341 {
342 rtx op0 = operands[0], op1 = operands[1];
343 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
344 psABI, since the biggest alignment there is 4 bytes. */
345 unsigned int align = (TARGET_IAMCU
346 ? GET_MODE_BITSIZE (mode)
347 : GET_MODE_ALIGNMENT (mode));
348
349 if (push_operand (op0, VOIDmode))
350 op0 = emit_move_resolve_push (mode, op0);
351
352 /* Force constants other than zero into memory. We do not know how
353 the instructions used to build constants modify the upper 64 bits
354 of the register; once we have that information we may be able
355 to handle some of them more efficiently. */
356 if (can_create_pseudo_p ()
357 && (CONSTANT_P (op1)
358 || (SUBREG_P (op1)
359 && CONSTANT_P (SUBREG_REG (op1))))
360 && ((register_operand (op0, mode)
361 && !standard_sse_constant_p (op1, mode))
362 /* ix86_expand_vector_move_misalign() does not like constants. */
363 || (SSE_REG_MODE_P (mode)
364 && MEM_P (op0)
365 && MEM_ALIGN (op0) < align)))
366 {
367 if (SUBREG_P (op1))
368 {
369 machine_mode imode = GET_MODE (SUBREG_REG (op1));
370 rtx r = force_const_mem (imode, SUBREG_REG (op1));
371 if (r)
372 r = validize_mem (r);
373 else
374 r = force_reg (imode, SUBREG_REG (op1));
375 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
376 }
377 else
378 op1 = validize_mem (force_const_mem (mode, op1));
379 }
380
381 /* We need to check memory alignment for SSE modes since attributes
382 can make operands unaligned. */
383 if (can_create_pseudo_p ()
384 && SSE_REG_MODE_P (mode)
385 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
386 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
387 {
388 rtx tmp[2];
389
390 /* ix86_expand_vector_move_misalign() does not like both
391 arguments in memory. */
392 if (!register_operand (op0, mode)
393 && !register_operand (op1, mode))
394 op1 = force_reg (mode, op1);
395
396 tmp[0] = op0; tmp[1] = op1;
397 ix86_expand_vector_move_misalign (mode, tmp);
398 return;
399 }
400
401 /* Make operand1 a register if it isn't already. */
402 if (can_create_pseudo_p ()
403 && !register_operand (op0, mode)
404 && !register_operand (op1, mode))
405 {
406 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
407 return;
408 }
409
410 emit_insn (gen_rtx_SET (op0, op1));
411 }
412
413 /* Split 32-byte AVX unaligned load and store if needed. */
414
415 static void
416 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
417 {
418 rtx m;
419 rtx (*extract) (rtx, rtx, rtx);
420 machine_mode mode;
421
422 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
423 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
424 {
425 emit_insn (gen_rtx_SET (op0, op1));
426 return;
427 }
428
429 rtx orig_op0 = NULL_RTX;
430 mode = GET_MODE (op0);
431 switch (GET_MODE_CLASS (mode))
432 {
433 case MODE_VECTOR_INT:
434 case MODE_INT:
435 if (mode != V32QImode)
436 {
437 if (!MEM_P (op0))
438 {
439 orig_op0 = op0;
440 op0 = gen_reg_rtx (V32QImode);
441 }
442 else
443 op0 = gen_lowpart (V32QImode, op0);
444 op1 = gen_lowpart (V32QImode, op1);
445 mode = V32QImode;
446 }
447 break;
448 case MODE_VECTOR_FLOAT:
449 break;
450 default:
451 gcc_unreachable ();
452 }
453
454 switch (mode)
455 {
456 default:
457 gcc_unreachable ();
458 case E_V32QImode:
459 extract = gen_avx_vextractf128v32qi;
460 mode = V16QImode;
461 break;
462 case E_V8SFmode:
463 extract = gen_avx_vextractf128v8sf;
464 mode = V4SFmode;
465 break;
466 case E_V4DFmode:
467 extract = gen_avx_vextractf128v4df;
468 mode = V2DFmode;
469 break;
470 }
471
472 if (MEM_P (op1))
473 {
474 rtx r = gen_reg_rtx (mode);
475 m = adjust_address (op1, mode, 0);
476 emit_move_insn (r, m);
477 m = adjust_address (op1, mode, 16);
478 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
479 emit_move_insn (op0, r);
480 }
481 else if (MEM_P (op0))
482 {
483 m = adjust_address (op0, mode, 0);
484 emit_insn (extract (m, op1, const0_rtx));
485 m = adjust_address (op0, mode, 16);
486 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
487 }
488 else
489 gcc_unreachable ();
490
491 if (orig_op0)
492 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
493 }
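/* Illustrative outcome (a sketch; the actual insns depend on the mode):
   with 32-byte load splitting enabled, an unaligned V8SF load becomes
   roughly

     vmovups     mem,      %xmm_tmp
     vinsertf128 $1, mem+16, %ymm_tmp, %ymm_dst

   and with store splitting enabled an unaligned 32-byte store becomes
   two vextractf128 stores of 16 bytes each.  */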
494
495 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
496 straight to ix86_expand_vector_move. */
497 /* Code generation for scalar reg-reg moves of single and double precision data:
498 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
499 movaps reg, reg
500 else
501 movss reg, reg
502 if (x86_sse_partial_reg_dependency == true)
503 movapd reg, reg
504 else
505 movsd reg, reg
506
507 Code generation for scalar loads of double precision data:
508 if (x86_sse_split_regs == true)
509 movlpd mem, reg (gas syntax)
510 else
511 movsd mem, reg
512
513 Code generation for unaligned packed loads of single precision data
514 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
515 if (x86_sse_unaligned_move_optimal)
516 movups mem, reg
517
518 if (x86_sse_partial_reg_dependency == true)
519 {
520 xorps reg, reg
521 movlps mem, reg
522 movhps mem+8, reg
523 }
524 else
525 {
526 movlps mem, reg
527 movhps mem+8, reg
528 }
529
530 Code generation for unaligned packed loads of double precision data
531 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
532 if (x86_sse_unaligned_move_optimal)
533 movupd mem, reg
534
535 if (x86_sse_split_regs == true)
536 {
537 movlpd mem, reg
538 movhpd mem+8, reg
539 }
540 else
541 {
542 movsd mem, reg
543 movhpd mem+8, reg
544 }
545 */
546
547 void
548 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
549 {
550 rtx op0, op1, m;
551
552 op0 = operands[0];
553 op1 = operands[1];
554
555 /* Use unaligned load/store for AVX512 or when optimizing for size. */
556 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
557 {
558 emit_insn (gen_rtx_SET (op0, op1));
559 return;
560 }
561
562 if (TARGET_AVX)
563 {
564 if (GET_MODE_SIZE (mode) == 32)
565 ix86_avx256_split_vector_move_misalign (op0, op1);
566 else
567 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
568 emit_insn (gen_rtx_SET (op0, op1));
569 return;
570 }
571
572 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
573 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
574 {
575 emit_insn (gen_rtx_SET (op0, op1));
576 return;
577 }
578
579 /* ??? If we have typed data, then it would appear that using
580 movdqu is the only way to get unaligned data loaded with
581 integer type. */
582 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
583 {
584 emit_insn (gen_rtx_SET (op0, op1));
585 return;
586 }
587
588 if (MEM_P (op1))
589 {
590 if (TARGET_SSE2 && mode == V2DFmode)
591 {
592 rtx zero;
593
594 /* When SSE registers are split into halves, we can avoid
595 writing to the top half twice. */
596 if (TARGET_SSE_SPLIT_REGS)
597 {
598 emit_clobber (op0);
599 zero = op0;
600 }
601 else
602 {
603 /* ??? Not sure about the best option for the Intel chips.
604 The following would seem to satisfy; the register is
605 entirely cleared, breaking the dependency chain. We
606 then store to the upper half, with a dependency depth
607 of one. A rumor has it that Intel recommends two movsd
608 followed by an unpacklpd, but this is unconfirmed. And
609 given that the dependency depth of the unpacklpd would
610 still be one, I'm not sure why this would be better. */
611 zero = CONST0_RTX (V2DFmode);
612 }
613
614 m = adjust_address (op1, DFmode, 0);
615 emit_insn (gen_sse2_loadlpd (op0, zero, m));
616 m = adjust_address (op1, DFmode, 8);
617 emit_insn (gen_sse2_loadhpd (op0, op0, m));
618 }
619 else
620 {
621 rtx t;
622
623 if (mode != V4SFmode)
624 t = gen_reg_rtx (V4SFmode);
625 else
626 t = op0;
627
628 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
629 emit_move_insn (t, CONST0_RTX (V4SFmode));
630 else
631 emit_clobber (t);
632
633 m = adjust_address (op1, V2SFmode, 0);
634 emit_insn (gen_sse_loadlps (t, t, m));
635 m = adjust_address (op1, V2SFmode, 8);
636 emit_insn (gen_sse_loadhps (t, t, m));
637 if (mode != V4SFmode)
638 emit_move_insn (op0, gen_lowpart (mode, t));
639 }
640 }
641 else if (MEM_P (op0))
642 {
643 if (TARGET_SSE2 && mode == V2DFmode)
644 {
645 m = adjust_address (op0, DFmode, 0);
646 emit_insn (gen_sse2_storelpd (m, op1));
647 m = adjust_address (op0, DFmode, 8);
648 emit_insn (gen_sse2_storehpd (m, op1));
649 }
650 else
651 {
652 if (mode != V4SFmode)
653 op1 = gen_lowpart (V4SFmode, op1);
654
655 m = adjust_address (op0, V2SFmode, 0);
656 emit_insn (gen_sse_storelps (m, op1));
657 m = adjust_address (op0, V2SFmode, 8);
658 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
659 }
660 }
661 else
662 gcc_unreachable ();
663 }
664
665 /* Move bits 64:95 to bits 32:63. */
666
667 void
668 ix86_move_vector_high_sse_to_mmx (rtx op)
669 {
670 rtx mask = gen_rtx_PARALLEL (VOIDmode,
671 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
672 GEN_INT (0), GEN_INT (0)));
673 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
674 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
675 rtx insn = gen_rtx_SET (dest, op);
676 emit_insn (insn);
677 }
678
679 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
680
681 void
682 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
683 {
684 rtx op0 = operands[0];
685 rtx op1 = operands[1];
686 rtx op2 = operands[2];
687
688 machine_mode dmode = GET_MODE (op0);
689 machine_mode smode = GET_MODE (op1);
690 machine_mode inner_dmode = GET_MODE_INNER (dmode);
691 machine_mode inner_smode = GET_MODE_INNER (smode);
692
693 /* Get the corresponding SSE mode for destination. */
694 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
695 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
696 nunits).require ();
697 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
698 nunits / 2).require ();
699
700 /* Get the corresponding SSE mode for source. */
701 nunits = 16 / GET_MODE_SIZE (inner_smode);
702 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
703 nunits).require ();
704
705 /* Generate SSE pack with signed/unsigned saturation. */
706 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
707 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
708 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
709
710 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
711 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
712 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
713 op1, op2));
714 emit_insn (insn);
715
716 ix86_move_vector_high_sse_to_mmx (op0);
717 }
718
719 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
720
721 void
722 ix86_split_mmx_punpck (rtx operands[], bool high_p)
723 {
724 rtx op0 = operands[0];
725 rtx op1 = operands[1];
726 rtx op2 = operands[2];
727 machine_mode mode = GET_MODE (op0);
728 rtx mask;
729 /* The corresponding SSE mode. */
730 machine_mode sse_mode, double_sse_mode;
731
732 switch (mode)
733 {
734 case E_V8QImode:
735 sse_mode = V16QImode;
736 double_sse_mode = V32QImode;
737 mask = gen_rtx_PARALLEL (VOIDmode,
738 gen_rtvec (16,
739 GEN_INT (0), GEN_INT (16),
740 GEN_INT (1), GEN_INT (17),
741 GEN_INT (2), GEN_INT (18),
742 GEN_INT (3), GEN_INT (19),
743 GEN_INT (4), GEN_INT (20),
744 GEN_INT (5), GEN_INT (21),
745 GEN_INT (6), GEN_INT (22),
746 GEN_INT (7), GEN_INT (23)));
747 break;
748
749 case E_V4HImode:
750 sse_mode = V8HImode;
751 double_sse_mode = V16HImode;
752 mask = gen_rtx_PARALLEL (VOIDmode,
753 gen_rtvec (8,
754 GEN_INT (0), GEN_INT (8),
755 GEN_INT (1), GEN_INT (9),
756 GEN_INT (2), GEN_INT (10),
757 GEN_INT (3), GEN_INT (11)));
758 break;
759
760 case E_V2SImode:
761 sse_mode = V4SImode;
762 double_sse_mode = V8SImode;
763 mask = gen_rtx_PARALLEL (VOIDmode,
764 gen_rtvec (4,
765 GEN_INT (0), GEN_INT (4),
766 GEN_INT (1), GEN_INT (5)));
767 break;
768
769 default:
770 gcc_unreachable ();
771 }
772
773 /* Generate SSE punpcklXX. */
774 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
775 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
776 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
777
778 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
779 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
780 rtx insn = gen_rtx_SET (dest, op2);
781 emit_insn (insn);
782
783 if (high_p)
784 {
785 /* Move bits 64:127 to bits 0:63. */
786 mask = gen_rtx_PARALLEL (VOIDmode,
787 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
788 GEN_INT (0), GEN_INT (0)));
789 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
790 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
791 insn = gen_rtx_SET (dest, op1);
792 emit_insn (insn);
793 }
794 }
795
796 /* Helper function of ix86_fixup_binary_operands to canonicalize
797 operand order. Returns true if the operands should be swapped. */
798
799 static bool
800 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
801 rtx operands[])
802 {
803 rtx dst = operands[0];
804 rtx src1 = operands[1];
805 rtx src2 = operands[2];
806
807 /* If the operation is not commutative, we can't do anything. */
808 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
809 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
810 return false;
811
812 /* Highest priority is that src1 should match dst. */
813 if (rtx_equal_p (dst, src1))
814 return false;
815 if (rtx_equal_p (dst, src2))
816 return true;
817
818 /* Next highest priority is that immediate constants come second. */
819 if (immediate_operand (src2, mode))
820 return false;
821 if (immediate_operand (src1, mode))
822 return true;
823
824 /* Lowest priority is that memory references should come second. */
825 if (MEM_P (src2))
826 return false;
827 if (MEM_P (src1))
828 return true;
829
830 return false;
831 }
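/* For example: with a commutative PLUS and operands (dst, src1, src2),
   if dst is the same register as src2 the function above returns true
   so that the matching operand becomes src1; likewise reg = const + reg
   is swapped so that the immediate ends up as the second source, and
   reg = mem + reg so that the memory reference does.  */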
832
833
834 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
835 destination to use for the operation. If different from the true
836 destination in operands[0], a copy operation will be required. */
837
838 rtx
839 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
840 rtx operands[])
841 {
842 rtx dst = operands[0];
843 rtx src1 = operands[1];
844 rtx src2 = operands[2];
845
846 /* Canonicalize operand order. */
847 if (ix86_swap_binary_operands_p (code, mode, operands))
848 {
849 /* It is invalid to swap operands of different modes. */
850 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
851
852 std::swap (src1, src2);
853 }
854
855 /* Both source operands cannot be in memory. */
856 if (MEM_P (src1) && MEM_P (src2))
857 {
858 /* Optimization: Only read from memory once. */
859 if (rtx_equal_p (src1, src2))
860 {
861 src2 = force_reg (mode, src2);
862 src1 = src2;
863 }
864 else if (rtx_equal_p (dst, src1))
865 src2 = force_reg (mode, src2);
866 else
867 src1 = force_reg (mode, src1);
868 }
869
870 /* If the destination is memory, and we do not have matching source
871 operands, do things in registers. */
872 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
873 dst = gen_reg_rtx (mode);
874
875 /* Source 1 cannot be a constant. */
876 if (CONSTANT_P (src1))
877 src1 = force_reg (mode, src1);
878
879 /* Source 1 cannot be a non-matching memory. */
880 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
881 src1 = force_reg (mode, src1);
882
883 /* Improve address combine. */
884 if (code == PLUS
885 && GET_MODE_CLASS (mode) == MODE_INT
886 && MEM_P (src2))
887 src2 = force_reg (mode, src2);
888
889 operands[1] = src1;
890 operands[2] = src2;
891 return dst;
892 }
893
894 /* Similarly, but assume that the destination has already been
895 set up properly. */
896
897 void
898 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
899 machine_mode mode, rtx operands[])
900 {
901 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
902 gcc_assert (dst == operands[0]);
903 }
904
905 /* Attempt to expand a binary operator. Make the expansion closer to the
906 actual machine than just general_operand, which would allow 3 separate
907 memory references (one output, two input) in a single insn. */
908
909 void
910 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
911 rtx operands[])
912 {
913 rtx src1, src2, dst, op, clob;
914
915 dst = ix86_fixup_binary_operands (code, mode, operands);
916 src1 = operands[1];
917 src2 = operands[2];
918
919 /* Emit the instruction. */
920
921 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
922
923 if (reload_completed
924 && code == PLUS
925 && !rtx_equal_p (dst, src1))
926 {
927 /* This is going to be an LEA; avoid splitting it later. */
928 emit_insn (op);
929 }
930 else
931 {
932 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
933 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
934 }
935
936 /* Fix up the destination if needed. */
937 if (dst != operands[0])
938 emit_move_insn (operands[0], dst);
939 }
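/* Sketch of the RTL emitted above in the typical case (before reload,
   or when the destination matches the first source):

     (parallel [(set (reg:SI dst) (plus:SI (reg:SI src1) (reg:SI src2)))
                (clobber (reg:CC flags))])

   The reload_completed PLUS case with a non-matching destination instead
   emits the bare SET so it can be output as an lea without a flags
   clobber.  */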
940
941 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
942 the given OPERANDS. */
943
944 void
945 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
946 rtx operands[])
947 {
948 rtx op1 = NULL_RTX, op2 = NULL_RTX;
949 if (SUBREG_P (operands[1]))
950 {
951 op1 = operands[1];
952 op2 = operands[2];
953 }
954 else if (SUBREG_P (operands[2]))
955 {
956 op1 = operands[2];
957 op2 = operands[1];
958 }
959 /* Optimize (__m128i) d | (__m128i) e and similar code,
960 when d and e are float vectors, into a float vector logical
961 insn. In C/C++, without using intrinsics, there is no way
962 to express a vector logical operation on float vectors other
963 than to cast them temporarily to integer vectors. */
964 if (op1
965 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
966 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
967 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
968 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
969 && SUBREG_BYTE (op1) == 0
970 && (GET_CODE (op2) == CONST_VECTOR
971 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
972 && SUBREG_BYTE (op2) == 0))
973 && can_create_pseudo_p ())
974 {
975 rtx dst;
976 switch (GET_MODE (SUBREG_REG (op1)))
977 {
978 case E_V4SFmode:
979 case E_V8SFmode:
980 case E_V16SFmode:
981 case E_V2DFmode:
982 case E_V4DFmode:
983 case E_V8DFmode:
984 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
985 if (GET_CODE (op2) == CONST_VECTOR)
986 {
987 op2 = gen_lowpart (GET_MODE (dst), op2);
988 op2 = force_reg (GET_MODE (dst), op2);
989 }
990 else
991 {
992 op1 = operands[1];
993 op2 = SUBREG_REG (operands[2]);
994 if (!vector_operand (op2, GET_MODE (dst)))
995 op2 = force_reg (GET_MODE (dst), op2);
996 }
997 op1 = SUBREG_REG (op1);
998 if (!vector_operand (op1, GET_MODE (dst)))
999 op1 = force_reg (GET_MODE (dst), op1);
1000 emit_insn (gen_rtx_SET (dst,
1001 gen_rtx_fmt_ee (code, GET_MODE (dst),
1002 op1, op2)));
1003 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1004 return;
1005 default:
1006 break;
1007 }
1008 }
1009 if (!vector_operand (operands[1], mode))
1010 operands[1] = force_reg (mode, operands[1]);
1011 if (!vector_operand (operands[2], mode))
1012 operands[2] = force_reg (mode, operands[2]);
1013 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1014 emit_insn (gen_rtx_SET (operands[0],
1015 gen_rtx_fmt_ee (code, mode, operands[1],
1016 operands[2])));
1017 }
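/* For example (illustrative): for

     __m128 a, b;
     __m128i x = (__m128i) a | (__m128i) b;

   the SUBREG handling above lets the IOR be done in V4SFmode, so the
   operation stays in the floating-point SSE domain instead of crossing
   into the integer domain and back.  */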
1018
1019 /* Return TRUE or FALSE depending on whether the binary operator meets the
1020 appropriate constraints. */
1021
1022 bool
1023 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1024 rtx operands[3])
1025 {
1026 rtx dst = operands[0];
1027 rtx src1 = operands[1];
1028 rtx src2 = operands[2];
1029
1030 /* Both source operands cannot be in memory. */
1031 if (MEM_P (src1) && MEM_P (src2))
1032 return false;
1033
1034 /* Canonicalize operand order for commutative operators. */
1035 if (ix86_swap_binary_operands_p (code, mode, operands))
1036 std::swap (src1, src2);
1037
1038 /* If the destination is memory, we must have a matching source operand. */
1039 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1040 return false;
1041
1042 /* Source 1 cannot be a constant. */
1043 if (CONSTANT_P (src1))
1044 return false;
1045
1046 /* Source 1 cannot be a non-matching memory. */
1047 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1048 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1049 return (code == AND
1050 && (mode == HImode
1051 || mode == SImode
1052 || (TARGET_64BIT && mode == DImode))
1053 && satisfies_constraint_L (src2));
1054
1055 return true;
1056 }
1057
1058 /* Attempt to expand a unary operator. Make the expansion closer to the
1059 actual machine than just general_operand, which would allow 2 separate
1060 memory references (one output, one input) in a single insn. */
1061
1062 void
1063 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1064 rtx operands[])
1065 {
1066 bool matching_memory = false;
1067 rtx src, dst, op, clob;
1068
1069 dst = operands[0];
1070 src = operands[1];
1071
1072 /* If the destination is memory, and we do not have matching source
1073 operands, do things in registers. */
1074 if (MEM_P (dst))
1075 {
1076 if (rtx_equal_p (dst, src))
1077 matching_memory = true;
1078 else
1079 dst = gen_reg_rtx (mode);
1080 }
1081
1082 /* When source operand is memory, destination must match. */
1083 if (MEM_P (src) && !matching_memory)
1084 src = force_reg (mode, src);
1085
1086 /* Emit the instruction. */
1087
1088 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1089
1090 if (code == NOT)
1091 emit_insn (op);
1092 else
1093 {
1094 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1095 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1096 }
1097
1098 /* Fix up the destination if needed. */
1099 if (dst != operands[0])
1100 emit_move_insn (operands[0], dst);
1101 }
1102
1103 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1104
1105 static void
1106 predict_jump (int prob)
1107 {
1108 rtx_insn *insn = get_last_insn ();
1109 gcc_assert (JUMP_P (insn));
1110 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1111 }
1112
1113 /* Split a 32-bit/64-bit divmod using an 8-bit unsigned divmod if dividend and
1114 divisor are within the range [0-255]. */
1115
1116 void
1117 ix86_split_idivmod (machine_mode mode, rtx operands[],
1118 bool unsigned_p)
1119 {
1120 rtx_code_label *end_label, *qimode_label;
1121 rtx div, mod;
1122 rtx_insn *insn;
1123 rtx scratch, tmp0, tmp1, tmp2;
1124 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1125 rtx (*gen_zero_extend) (rtx, rtx);
1126 rtx (*gen_test_ccno_1) (rtx, rtx);
1127
1128 switch (mode)
1129 {
1130 case E_SImode:
1131 if (GET_MODE (operands[0]) == SImode)
1132 {
1133 if (GET_MODE (operands[1]) == SImode)
1134 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1135 else
1136 gen_divmod4_1
1137 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1138 gen_zero_extend = gen_zero_extendqisi2;
1139 }
1140 else
1141 {
1142 gen_divmod4_1
1143 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1144 gen_zero_extend = gen_zero_extendqidi2;
1145 }
1146 gen_test_ccno_1 = gen_testsi_ccno_1;
1147 break;
1148 case E_DImode:
1149 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1150 gen_test_ccno_1 = gen_testdi_ccno_1;
1151 gen_zero_extend = gen_zero_extendqidi2;
1152 break;
1153 default:
1154 gcc_unreachable ();
1155 }
1156
1157 end_label = gen_label_rtx ();
1158 qimode_label = gen_label_rtx ();
1159
1160 scratch = gen_reg_rtx (mode);
1161
1162 /* Use 8-bit unsigned divmod if dividend and divisor are within
1163 the range [0-255]. */
1164 emit_move_insn (scratch, operands[2]);
1165 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1166 scratch, 1, OPTAB_DIRECT);
1167 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
1168 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1169 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1170 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1171 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1172 pc_rtx);
1173 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1174 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1175 JUMP_LABEL (insn) = qimode_label;
1176
1177 /* Generate the original signed/unsigned divmod. */
1178 div = gen_divmod4_1 (operands[0], operands[1],
1179 operands[2], operands[3]);
1180 emit_insn (div);
1181
1182 /* Branch to the end. */
1183 emit_jump_insn (gen_jump (end_label));
1184 emit_barrier ();
1185
1186 /* Generate 8bit unsigned divide. */
1187 emit_label (qimode_label);
1188 /* Don't use operands[0] for the result of the 8-bit divide since not all
1189 registers support QImode ZERO_EXTRACT. */
1190 tmp0 = lowpart_subreg (HImode, scratch, mode);
1191 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1192 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1193 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1194
1195 if (unsigned_p)
1196 {
1197 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1198 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1199 }
1200 else
1201 {
1202 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1203 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1204 }
1205 if (mode == SImode)
1206 {
1207 if (GET_MODE (operands[0]) != SImode)
1208 div = gen_rtx_ZERO_EXTEND (DImode, div);
1209 if (GET_MODE (operands[1]) != SImode)
1210 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1211 }
1212
1213 /* Extract remainder from AH. */
1214 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
1215 tmp0, GEN_INT (8), GEN_INT (8));
1216 if (REG_P (operands[1]))
1217 insn = emit_move_insn (operands[1], tmp1);
1218 else
1219 {
1220 /* Need a new scratch register since the old one holds the result
1221 of the 8-bit divide. */
1222 scratch = gen_reg_rtx (GET_MODE (operands[1]));
1223 emit_move_insn (scratch, tmp1);
1224 insn = emit_move_insn (operands[1], scratch);
1225 }
1226 set_unique_reg_note (insn, REG_EQUAL, mod);
1227
1228 /* Zero extend quotient from AL. */
1229 tmp1 = gen_lowpart (QImode, tmp0);
1230 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
1231 set_unique_reg_note (insn, REG_EQUAL, div);
1232
1233 emit_label (end_label);
1234 }
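/* Sketch of the control flow generated above (unsigned SImode case,
   register names illustrative):

     mov    dividend, scratch
     or     divisor, scratch
     test   $-0x100, scratch      ; any bit above 7 set in either value?
     je     .Lqimode
     div    ...                   ; full-width divide
     jmp    .Lend
   .Lqimode:
     divb   ...                   ; 8-bit divide: AL = quotient, AH = remainder
   .Lend:

   with REG_EQUAL notes describing the quotient and remainder attached to
   the moves out of AL and AH.  */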
1235
1236 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1237 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1238
1239 void
1240 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1241 rtx dst, rtx src)
1242 {
1243 rtx op, clob;
1244
1245 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1246 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1247
1248 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1249 }
1250
1251 /* Return true if the definition of REGNO1 is nearer to INSN than that of REGNO2. */
1252
1253 static bool
1254 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1255 {
1256 rtx_insn *prev = insn;
1257 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1258
1259 if (insn == start)
1260 return false;
1261 while (prev && prev != start)
1262 {
1263 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1264 {
1265 prev = PREV_INSN (prev);
1266 continue;
1267 }
1268 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1269 return true;
1270 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1271 return false;
1272 prev = PREV_INSN (prev);
1273 }
1274
1275 /* None of the regs is defined in the bb. */
1276 return false;
1277 }
1278
1279 /* Split an lea instruction into a sequence of instructions
1280 which are executed on the ALU to avoid AGU stalls.
1281 It is assumed that the flags register may be clobbered
1282 at the lea's position. */
1283
1284 void
1285 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1286 {
1287 unsigned int regno0, regno1, regno2;
1288 struct ix86_address parts;
1289 rtx target, tmp;
1290 int ok, adds;
1291
1292 ok = ix86_decompose_address (operands[1], &parts);
1293 gcc_assert (ok);
1294
1295 target = gen_lowpart (mode, operands[0]);
1296
1297 regno0 = true_regnum (target);
1298 regno1 = INVALID_REGNUM;
1299 regno2 = INVALID_REGNUM;
1300
1301 if (parts.base)
1302 {
1303 parts.base = gen_lowpart (mode, parts.base);
1304 regno1 = true_regnum (parts.base);
1305 }
1306
1307 if (parts.index)
1308 {
1309 parts.index = gen_lowpart (mode, parts.index);
1310 regno2 = true_regnum (parts.index);
1311 }
1312
1313 if (parts.disp)
1314 parts.disp = gen_lowpart (mode, parts.disp);
1315
1316 if (parts.scale > 1)
1317 {
1318 /* Case r1 = r1 + ... */
1319 if (regno1 == regno0)
1320 {
1321 /* If we have a case r1 = r1 + C * r2 then we
1322 should use multiplication, which is very
1323 expensive. Assume the cost model is wrong if we
1324 have such a case here. */
1325 gcc_assert (regno2 != regno0);
1326
1327 for (adds = parts.scale; adds > 0; adds--)
1328 ix86_emit_binop (PLUS, mode, target, parts.index);
1329 }
1330 else
1331 {
1332 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1333 if (regno0 != regno2)
1334 emit_insn (gen_rtx_SET (target, parts.index));
1335
1336 /* Use shift for scaling. */
1337 ix86_emit_binop (ASHIFT, mode, target,
1338 GEN_INT (exact_log2 (parts.scale)));
1339
1340 if (parts.base)
1341 ix86_emit_binop (PLUS, mode, target, parts.base);
1342
1343 if (parts.disp && parts.disp != const0_rtx)
1344 ix86_emit_binop (PLUS, mode, target, parts.disp);
1345 }
1346 }
1347 else if (!parts.base && !parts.index)
1348 {
1349 gcc_assert (parts.disp);
1350 emit_insn (gen_rtx_SET (target, parts.disp));
1351 }
1352 else
1353 {
1354 if (!parts.base)
1355 {
1356 if (regno0 != regno2)
1357 emit_insn (gen_rtx_SET (target, parts.index));
1358 }
1359 else if (!parts.index)
1360 {
1361 if (regno0 != regno1)
1362 emit_insn (gen_rtx_SET (target, parts.base));
1363 }
1364 else
1365 {
1366 if (regno0 == regno1)
1367 tmp = parts.index;
1368 else if (regno0 == regno2)
1369 tmp = parts.base;
1370 else
1371 {
1372 rtx tmp1;
1373
1374 /* Find better operand for SET instruction, depending
1375 on which definition is farther from the insn. */
1376 if (find_nearest_reg_def (insn, regno1, regno2))
1377 tmp = parts.index, tmp1 = parts.base;
1378 else
1379 tmp = parts.base, tmp1 = parts.index;
1380
1381 emit_insn (gen_rtx_SET (target, tmp));
1382
1383 if (parts.disp && parts.disp != const0_rtx)
1384 ix86_emit_binop (PLUS, mode, target, parts.disp);
1385
1386 ix86_emit_binop (PLUS, mode, target, tmp1);
1387 return;
1388 }
1389
1390 ix86_emit_binop (PLUS, mode, target, tmp);
1391 }
1392
1393 if (parts.disp && parts.disp != const0_rtx)
1394 ix86_emit_binop (PLUS, mode, target, parts.disp);
1395 }
1396 }
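/* For example (one possible sequence, not the only one): an lea computing
   r1 = r2 + r3*4 + 16 with distinct registers is split into

     mov  r3, r1
     shl  $2, r1
     add  r2, r1
     add  $16, r1

   while r1 = r1 + r3*2 is split into two additions of r3, since using a
   multiply for the scale would defeat the purpose of the split.  */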
1397
1398 /* Post-reload splitter for converting an SF or DFmode value in an
1399 SSE register into an unsigned SImode. */
1400
1401 void
1402 ix86_split_convert_uns_si_sse (rtx operands[])
1403 {
1404 machine_mode vecmode;
1405 rtx value, large, zero_or_two31, input, two31, x;
1406
1407 large = operands[1];
1408 zero_or_two31 = operands[2];
1409 input = operands[3];
1410 two31 = operands[4];
1411 vecmode = GET_MODE (large);
1412 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1413
1414 /* Load up the value into the low element. We must ensure that the other
1415 elements are valid floats -- zero is the easiest such value. */
1416 if (MEM_P (input))
1417 {
1418 if (vecmode == V4SFmode)
1419 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1420 else
1421 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1422 }
1423 else
1424 {
1425 input = gen_rtx_REG (vecmode, REGNO (input));
1426 emit_move_insn (value, CONST0_RTX (vecmode));
1427 if (vecmode == V4SFmode)
1428 emit_insn (gen_sse_movss (value, value, input));
1429 else
1430 emit_insn (gen_sse2_movsd (value, value, input));
1431 }
1432
1433 emit_move_insn (large, two31);
1434 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1435
1436 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1437 emit_insn (gen_rtx_SET (large, x));
1438
1439 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1440 emit_insn (gen_rtx_SET (zero_or_two31, x));
1441
1442 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1443 emit_insn (gen_rtx_SET (value, x));
1444
1445 large = gen_rtx_REG (V4SImode, REGNO (large));
1446 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1447
1448 x = gen_rtx_REG (V4SImode, REGNO (value));
1449 if (vecmode == V4SFmode)
1450 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1451 else
1452 emit_insn (gen_sse2_cvttpd2dq (x, value));
1453 value = x;
1454
1455 emit_insn (gen_xorv4si3 (value, value, large));
1456 }
1457
1458 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1459 machine_mode mode, rtx target,
1460 rtx var, int one_var);
1461
1462 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1463 Expects the 64-bit DImode to be supplied in a pair of integral
1464 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1465 -mfpmath=sse, !optimize_size only. */
1466
1467 void
1468 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1469 {
1470 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1471 rtx int_xmm, fp_xmm;
1472 rtx biases, exponents;
1473 rtx x;
1474
1475 int_xmm = gen_reg_rtx (V4SImode);
1476 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1477 emit_insn (gen_movdi_to_sse (int_xmm, input));
1478 else if (TARGET_SSE_SPLIT_REGS)
1479 {
1480 emit_clobber (int_xmm);
1481 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1482 }
1483 else
1484 {
1485 x = gen_reg_rtx (V2DImode);
1486 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1487 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1488 }
1489
1490 x = gen_rtx_CONST_VECTOR (V4SImode,
1491 gen_rtvec (4, GEN_INT (0x43300000UL),
1492 GEN_INT (0x45300000UL),
1493 const0_rtx, const0_rtx));
1494 exponents = validize_mem (force_const_mem (V4SImode, x));
1495
1496 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1497 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1498
1499 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1500 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1501 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1502 (0x1.0p84 + double(fp_value_hi_xmm)).
1503 Note these exponents differ by 32. */
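/* A worked check of the trick described above: write the 64-bit input as
   u = hi * 2**32 + lo with 0 <= hi, lo < 2**32. Both 0x1.0p52 + lo and
   0x1.0p84 + hi * 2**32 are exactly representable in DFmode (each needs
   at most 53 significant bits), so subtracting the biases recovers lo
   and hi * 2**32 exactly, and their sum is the desired value u.  */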
1504
1505 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1506
1507 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1508 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1509 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1510 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1511 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1512 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1513 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1514 biases = validize_mem (force_const_mem (V2DFmode, biases));
1515 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1516
1517 /* Add the upper and lower DFmode values together. */
1518 if (TARGET_SSE3)
1519 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1520 else
1521 {
1522 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1523 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1524 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1525 }
1526
1527 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1528 }
1529
1530 /* Not used, but eases macroization of patterns. */
1531 void
1532 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1533 {
1534 gcc_unreachable ();
1535 }
1536
1537 /* Convert an unsigned SImode value into a DFmode. Only currently used
1538 for SSE, but applicable anywhere. */
1539
1540 void
1541 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1542 {
1543 REAL_VALUE_TYPE TWO31r;
1544 rtx x, fp;
1545
1546 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1547 NULL, 1, OPTAB_DIRECT);
1548
1549 fp = gen_reg_rtx (DFmode);
1550 emit_insn (gen_floatsidf2 (fp, x));
1551
1552 real_ldexp (&TWO31r, &dconst1, 31);
1553 x = const_double_from_real_value (TWO31r, DFmode);
1554
1555 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1556 if (x != target)
1557 emit_move_insn (target, x);
1558 }
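/* A brief check of the arithmetic above: adding -2**31 in SImode maps the
   unsigned input u in [0, 2**32) to the signed value u - 2**31 in
   [-2**31, 2**31), which floatsidf2 converts exactly; adding the DFmode
   constant 2**31 back then yields u.  */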
1559
1560 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1561 32-bit mode; otherwise we have a direct convert instruction. */
1562
1563 void
1564 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1565 {
1566 REAL_VALUE_TYPE TWO32r;
1567 rtx fp_lo, fp_hi, x;
1568
1569 fp_lo = gen_reg_rtx (DFmode);
1570 fp_hi = gen_reg_rtx (DFmode);
1571
1572 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1573
1574 real_ldexp (&TWO32r, &dconst1, 32);
1575 x = const_double_from_real_value (TWO32r, DFmode);
1576 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1577
1578 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1579
1580 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1581 0, OPTAB_DIRECT);
1582 if (x != target)
1583 emit_move_insn (target, x);
1584 }
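/* In other words: for a signed 64-bit input split as hi:lo, the code above
   computes (double) hi * 0x1p32 + (double) (unsigned) lo. The high word
   carries the sign and the low word is treated as unsigned, so this equals
   the input value up to the final DFmode rounding of the sum.  */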
1585
1586 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1587 For x86_32, -mfpmath=sse, !optimize_size only. */
1588 void
1589 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1590 {
1591 REAL_VALUE_TYPE ONE16r;
1592 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1593
1594 real_ldexp (&ONE16r, &dconst1, 16);
1595 x = const_double_from_real_value (ONE16r, SFmode);
1596 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1597 NULL, 0, OPTAB_DIRECT);
1598 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1599 NULL, 0, OPTAB_DIRECT);
1600 fp_hi = gen_reg_rtx (SFmode);
1601 fp_lo = gen_reg_rtx (SFmode);
1602 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1603 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1604 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1605 0, OPTAB_DIRECT);
1606 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1607 0, OPTAB_DIRECT);
1608 if (!rtx_equal_p (target, fp_hi))
1609 emit_move_insn (target, fp_hi);
1610 }
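/* The identity used above: u = (u >> 16) * 0x1p16 + (u & 0xffff). Both
   halves fit in 16 bits and convert to SFmode exactly, and the multiply
   by 0x1p16 is exact, so only the final addition can round.  */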
1611
1612 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1613 a vector of unsigned ints VAL to vector of floats TARGET. */
1614
1615 void
1616 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1617 {
1618 rtx tmp[8];
1619 REAL_VALUE_TYPE TWO16r;
1620 machine_mode intmode = GET_MODE (val);
1621 machine_mode fltmode = GET_MODE (target);
1622 rtx (*cvt) (rtx, rtx);
1623
1624 if (intmode == V4SImode)
1625 cvt = gen_floatv4siv4sf2;
1626 else
1627 cvt = gen_floatv8siv8sf2;
1628 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1629 tmp[0] = force_reg (intmode, tmp[0]);
1630 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1631 OPTAB_DIRECT);
1632 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1633 NULL_RTX, 1, OPTAB_DIRECT);
1634 tmp[3] = gen_reg_rtx (fltmode);
1635 emit_insn (cvt (tmp[3], tmp[1]));
1636 tmp[4] = gen_reg_rtx (fltmode);
1637 emit_insn (cvt (tmp[4], tmp[2]));
1638 real_ldexp (&TWO16r, &dconst1, 16);
1639 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1640 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1641 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1642 OPTAB_DIRECT);
1643 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1644 OPTAB_DIRECT);
1645 if (tmp[7] != target)
1646 emit_move_insn (target, tmp[7]);
1647 }
1648
1649 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1650 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1651 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1652 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1653
1654 rtx
1655 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1656 {
1657 REAL_VALUE_TYPE TWO31r;
1658 rtx two31r, tmp[4];
1659 machine_mode mode = GET_MODE (val);
1660 machine_mode scalarmode = GET_MODE_INNER (mode);
1661 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1662 rtx (*cmp) (rtx, rtx, rtx, rtx);
1663 int i;
1664
1665 for (i = 0; i < 3; i++)
1666 tmp[i] = gen_reg_rtx (mode);
1667 real_ldexp (&TWO31r, &dconst1, 31);
1668 two31r = const_double_from_real_value (TWO31r, scalarmode);
1669 two31r = ix86_build_const_vector (mode, 1, two31r);
1670 two31r = force_reg (mode, two31r);
1671 switch (mode)
1672 {
1673 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1674 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1675 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1676 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1677 default: gcc_unreachable ();
1678 }
1679 tmp[3] = gen_rtx_LE (mode, two31r, val);
1680 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1681 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1682 0, OPTAB_DIRECT);
1683 if (intmode == V4SImode || TARGET_AVX2)
1684 *xorp = expand_simple_binop (intmode, ASHIFT,
1685 gen_lowpart (intmode, tmp[0]),
1686 GEN_INT (31), NULL_RTX, 0,
1687 OPTAB_DIRECT);
1688 else
1689 {
1690 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
1691 two31 = ix86_build_const_vector (intmode, 1, two31);
1692 *xorp = expand_simple_binop (intmode, AND,
1693 gen_lowpart (intmode, tmp[0]),
1694 two31, NULL_RTX, 0,
1695 OPTAB_DIRECT);
1696 }
1697 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1698 0, OPTAB_DIRECT);
1699 }
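/* A short check of the adjustment above: lanes with VAL < 0x1p31 compare
   false, so nothing is subtracted and the corresponding *XORP lane is
   zero; lanes with VAL >= 0x1p31 have 0x1p31 subtracted so the signed
   conversion is in range, and *XORP supplies the 0x80000000 bit that
   restores the unsigned result afterwards.  */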
1700
1701 /* Generate code for floating point ABS or NEG. */
1702
1703 void
1704 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1705 rtx operands[])
1706 {
1707 rtx mask, set, dst, src;
1708 bool use_sse = false;
1709 bool vector_mode = VECTOR_MODE_P (mode);
1710 machine_mode vmode = mode;
1711
1712 if (vector_mode)
1713 use_sse = true;
1714 else if (mode == TFmode)
1715 use_sse = true;
1716 else if (TARGET_SSE_MATH)
1717 {
1718 use_sse = SSE_FLOAT_MODE_P (mode);
1719 if (mode == SFmode)
1720 vmode = V4SFmode;
1721 else if (mode == DFmode)
1722 vmode = V2DFmode;
1723 }
1724
1725 /* NEG and ABS performed with SSE use bitwise mask operations.
1726 Create the appropriate mask now. */
1727 if (use_sse)
1728 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1729 else
1730 mask = NULL_RTX;
1731
1732 dst = operands[0];
1733 src = operands[1];
1734
1735 set = gen_rtx_fmt_e (code, mode, src);
1736 set = gen_rtx_SET (dst, set);
1737
1738 if (mask)
1739 {
1740 rtx use, clob;
1741 rtvec par;
1742
1743 use = gen_rtx_USE (VOIDmode, mask);
1744 if (vector_mode)
1745 par = gen_rtvec (2, set, use);
1746 else
1747 {
1748 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1749 par = gen_rtvec (3, set, use, clob);
1750 }
1751 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1752 }
1753 else
1754 emit_insn (set);
1755 }
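/* Illustrative note: with SSE the mask built above is the sign-bit mask
   (inverted for ABS), and the patterns matching the emitted SET realize
   ABS as an AND that clears the sign bit and NEG as an XOR that flips
   it; the USE keeps the mask operand attached to the insn.  */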
1756
1757 /* Expand a copysign operation. Special case operand 0 being a constant. */
1758
1759 void
1760 ix86_expand_copysign (rtx operands[])
1761 {
1762 machine_mode mode, vmode;
1763 rtx dest, op0, op1, mask, nmask;
1764
1765 dest = operands[0];
1766 op0 = operands[1];
1767 op1 = operands[2];
1768
1769 mode = GET_MODE (dest);
1770
1771 if (mode == SFmode)
1772 vmode = V4SFmode;
1773 else if (mode == DFmode)
1774 vmode = V2DFmode;
1775 else
1776 vmode = mode;
1777
1778 if (CONST_DOUBLE_P (op0))
1779 {
1780 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
1781
1782 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1783 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1784
1785 if (mode == SFmode || mode == DFmode)
1786 {
1787 if (op0 == CONST0_RTX (mode))
1788 op0 = CONST0_RTX (vmode);
1789 else
1790 {
1791 rtx v = ix86_build_const_vector (vmode, false, op0);
1792
1793 op0 = force_reg (vmode, v);
1794 }
1795 }
1796 else if (op0 != CONST0_RTX (mode))
1797 op0 = force_reg (mode, op0);
1798
1799 mask = ix86_build_signbit_mask (vmode, 0, 0);
1800
1801 if (mode == SFmode)
1802 copysign_insn = gen_copysignsf3_const;
1803 else if (mode == DFmode)
1804 copysign_insn = gen_copysigndf3_const;
1805 else
1806 copysign_insn = gen_copysigntf3_const;
1807
1808 emit_insn (copysign_insn (dest, op0, op1, mask));
1809 }
1810 else
1811 {
1812 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
1813
1814 nmask = ix86_build_signbit_mask (vmode, 0, 1);
1815 mask = ix86_build_signbit_mask (vmode, 0, 0);
1816
1817 if (mode == SFmode)
1818 copysign_insn = gen_copysignsf3_var;
1819 else if (mode == DFmode)
1820 copysign_insn = gen_copysigndf3_var;
1821 else
1822 copysign_insn = gen_copysigntf3_var;
1823
1824 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
1825 }
1826 }
1827
1828 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1829 be a constant, and so has already been expanded into a vector constant. */
1830
1831 void
1832 ix86_split_copysign_const (rtx operands[])
1833 {
1834 machine_mode mode, vmode;
1835 rtx dest, op0, mask, x;
1836
1837 dest = operands[0];
1838 op0 = operands[1];
1839 mask = operands[3];
1840
1841 mode = GET_MODE (dest);
1842 vmode = GET_MODE (mask);
1843
1844 dest = lowpart_subreg (vmode, dest, mode);
1845 x = gen_rtx_AND (vmode, dest, mask);
1846 emit_insn (gen_rtx_SET (dest, x));
1847
1848 if (op0 != CONST0_RTX (vmode))
1849 {
1850 x = gen_rtx_IOR (vmode, dest, op0);
1851 emit_insn (gen_rtx_SET (dest, x));
1852 }
1853 }
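/* Recall the bit-level identity behind the split above: with MASK holding
   only the sign bit, copysign (x, y) has the bits
   (bits (y) & MASK) | bits (fabs (x)). The AND keeps the sign of the
   value already in the destination (the matching pattern ties the
   destination to the sign operand) and the IOR merges in the
   (nonnegative) constant magnitude.  */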
1854
1855 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1856 so we have to do two masks. */
1857
1858 void
1859 ix86_split_copysign_var (rtx operands[])
1860 {
1861 machine_mode mode, vmode;
1862 rtx dest, scratch, op0, op1, mask, nmask, x;
1863
1864 dest = operands[0];
1865 scratch = operands[1];
1866 op0 = operands[2];
1867 op1 = operands[3];
1868 nmask = operands[4];
1869 mask = operands[5];
1870
1871 mode = GET_MODE (dest);
1872 vmode = GET_MODE (mask);
1873
1874 if (rtx_equal_p (op0, op1))
1875 {
1876 /* Shouldn't happen often (it's useless, obviously), but when it does
1877 we'd generate incorrect code if we continue below. */
1878 emit_move_insn (dest, op0);
1879 return;
1880 }
1881
1882 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1883 {
1884 gcc_assert (REGNO (op1) == REGNO (scratch));
1885
1886 x = gen_rtx_AND (vmode, scratch, mask);
1887 emit_insn (gen_rtx_SET (scratch, x));
1888
1889 dest = mask;
1890 op0 = lowpart_subreg (vmode, op0, mode);
1891 x = gen_rtx_NOT (vmode, dest);
1892 x = gen_rtx_AND (vmode, x, op0);
1893 emit_insn (gen_rtx_SET (dest, x));
1894 }
1895 else
1896 {
1897 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1898 {
1899 x = gen_rtx_AND (vmode, scratch, mask);
1900 }
1901 else /* alternative 2,4 */
1902 {
1903 gcc_assert (REGNO (mask) == REGNO (scratch));
1904 op1 = lowpart_subreg (vmode, op1, mode);
1905 x = gen_rtx_AND (vmode, scratch, op1);
1906 }
1907 emit_insn (gen_rtx_SET (scratch, x));
1908
1909 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1910 {
1911 dest = lowpart_subreg (vmode, op0, mode);
1912 x = gen_rtx_AND (vmode, dest, nmask);
1913 }
1914 else /* alternative 3,4 */
1915 {
1916 gcc_assert (REGNO (nmask) == REGNO (dest));
1917 dest = nmask;
1918 op0 = lowpart_subreg (vmode, op0, mode);
1919 x = gen_rtx_AND (vmode, dest, op0);
1920 }
1921 emit_insn (gen_rtx_SET (dest, x));
1922 }
1923
1924 x = gen_rtx_IOR (vmode, dest, scratch);
1925 emit_insn (gen_rtx_SET (dest, x));
1926 }
1927
1928 /* Expand an xorsign operation. */
1929
1930 void
1931 ix86_expand_xorsign (rtx operands[])
1932 {
1933 rtx (*xorsign_insn)(rtx, rtx, rtx, rtx);
1934 machine_mode mode, vmode;
1935 rtx dest, op0, op1, mask;
1936
1937 dest = operands[0];
1938 op0 = operands[1];
1939 op1 = operands[2];
1940
1941 mode = GET_MODE (dest);
1942
1943 if (mode == SFmode)
1944 {
1945 xorsign_insn = gen_xorsignsf3_1;
1946 vmode = V4SFmode;
1947 }
1948 else if (mode == DFmode)
1949 {
1950 xorsign_insn = gen_xorsigndf3_1;
1951 vmode = V2DFmode;
1952 }
1953 else
1954 gcc_unreachable ();
1955
1956 mask = ix86_build_signbit_mask (vmode, 0, 0);
1957
1958 emit_insn (xorsign_insn (dest, op0, op1, mask));
1959 }
1960
1961 /* Deconstruct an xorsign operation into bit masks. */
1962
1963 void
1964 ix86_split_xorsign (rtx operands[])
1965 {
1966 machine_mode mode, vmode;
1967 rtx dest, op0, mask, x;
1968
1969 dest = operands[0];
1970 op0 = operands[1];
1971 mask = operands[3];
1972
1973 mode = GET_MODE (dest);
1974 vmode = GET_MODE (mask);
1975
1976 dest = lowpart_subreg (vmode, dest, mode);
1977 x = gen_rtx_AND (vmode, dest, mask);
1978 emit_insn (gen_rtx_SET (dest, x));
1979
1980 op0 = lowpart_subreg (vmode, op0, mode);
1981 x = gen_rtx_XOR (vmode, dest, op0);
1982 emit_insn (gen_rtx_SET (dest, x));
1983 }
1984
1985 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
1986
1987 void
1988 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
1989 {
1990 machine_mode mode = GET_MODE (op0);
1991 rtx tmp;
1992
1993 /* Handle the special case of a vector comparison with a boolean result;
1994 transform it using the ptest instruction. */
1995 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
1996 {
1997 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
1998 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
1999
2000 gcc_assert (code == EQ || code == NE);
2001 /* Generate XOR since we can't check that one operand is a zero vector. */
2002 tmp = gen_reg_rtx (mode);
2003 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2004 tmp = gen_lowpart (p_mode, tmp);
2005 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2006 gen_rtx_UNSPEC (CCmode,
2007 gen_rtvec (2, tmp, tmp),
2008 UNSPEC_PTEST)));
2009 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2010 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2011 gen_rtx_LABEL_REF (VOIDmode, label),
2012 pc_rtx);
2013 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2014 return;
2015 }
2016
2017 switch (mode)
2018 {
2019 case E_SFmode:
2020 case E_DFmode:
2021 case E_XFmode:
2022 case E_QImode:
2023 case E_HImode:
2024 case E_SImode:
2025 simple:
2026 tmp = ix86_expand_compare (code, op0, op1);
2027 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2028 gen_rtx_LABEL_REF (VOIDmode, label),
2029 pc_rtx);
2030 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2031 return;
2032
2033 case E_DImode:
2034 if (TARGET_64BIT)
2035 goto simple;
2036 /* On a 32-bit target a DImode comparison may be performed in
2037 SSE registers. To allow this we avoid splitting into
2038 SImode, which is achieved by doing the xor in DImode
2039 and then comparing against zero (which the STV pass
2040 recognizes). We don't compare using xor when optimizing
2041 for size. */
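/* E.g. a 64-bit a == b on ia32 then becomes t = a ^ b; t == 0, which the
   STV pass can keep entirely in SSE registers instead of splitting it
   into two SImode compares.  */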
2042 if (!optimize_insn_for_size_p ()
2043 && TARGET_STV
2044 && (code == EQ || code == NE))
2045 {
2046 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2047 op1 = const0_rtx;
2048 }
2049 /* FALLTHRU */
2050 case E_TImode:
2051 /* Expand a double-word branch into multiple compare+branch sequences. */
2052 {
2053 rtx lo[2], hi[2];
2054 rtx_code_label *label2;
2055 enum rtx_code code1, code2, code3;
2056 machine_mode submode;
2057
2058 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2059 {
2060 std::swap (op0, op1);
2061 code = swap_condition (code);
2062 }
2063
2064 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2065 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2066
2067 submode = mode == DImode ? SImode : DImode;
2068
2069 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2070 avoid two branches. This costs one extra insn, so disable when
2071 optimizing for size. */
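/* E.g. on ia32 a DImode equality test then becomes roughly
       xorl  hi1, hi0
       xorl  lo1, lo0
       orl   lo0, hi0
       je/jne label
   instead of two separate compare-and-branch sequences.  */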
2072
2073 if ((code == EQ || code == NE)
2074 && (!optimize_insn_for_size_p ()
2075 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2076 {
2077 rtx xor0, xor1;
2078
2079 xor1 = hi[0];
2080 if (hi[1] != const0_rtx)
2081 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2082 NULL_RTX, 0, OPTAB_WIDEN);
2083
2084 xor0 = lo[0];
2085 if (lo[1] != const0_rtx)
2086 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2087 NULL_RTX, 0, OPTAB_WIDEN);
2088
2089 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2090 NULL_RTX, 0, OPTAB_WIDEN);
2091
2092 ix86_expand_branch (code, tmp, const0_rtx, label);
2093 return;
2094 }
2095
2096 /* Otherwise, if we are doing a less-than or greater-or-equal-than
2097 comparison, op1 is a constant and its low word is zero, then we can
2098 just examine the high word. Similarly for a low word of -1 and
2099 less-or-equal-than or greater-than. */
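/* E.g. a < 0x300000000LL reduces to comparing just the high words
   (hi(a) < 3), and a <= 0x2ffffffffLL likewise needs only hi(a) <= 2.  */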
2100
2101 if (CONST_INT_P (hi[1]))
2102 switch (code)
2103 {
2104 case LT: case LTU: case GE: case GEU:
2105 if (lo[1] == const0_rtx)
2106 {
2107 ix86_expand_branch (code, hi[0], hi[1], label);
2108 return;
2109 }
2110 break;
2111 case LE: case LEU: case GT: case GTU:
2112 if (lo[1] == constm1_rtx)
2113 {
2114 ix86_expand_branch (code, hi[0], hi[1], label);
2115 return;
2116 }
2117 break;
2118 default:
2119 break;
2120 }
2121
2122 /* Emulate comparisons that do not depend on Zero flag with
2123 double-word subtraction. Note that only Overflow, Sign
2124 and Carry flags are valid, so swap arguments and condition
2125 of comparisons that would otherwise test Zero flag. */
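/* The resulting sequence is essentially
       cmp  lo0, lo1            ; carry from the low-word subtraction
       sbb  hi0, hi1            ; only the flags of hi0 - hi1 - CF are kept
       jb / jl  label           ; carry for unsigned, sign/overflow for signed
   (operand order shown schematically, not in AT&T syntax), with the SBB
   result itself discarded into a scratch.  */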
2126
2127 switch (code)
2128 {
2129 case LE: case LEU: case GT: case GTU:
2130 std::swap (lo[0], lo[1]);
2131 std::swap (hi[0], hi[1]);
2132 code = swap_condition (code);
2133 /* FALLTHRU */
2134
2135 case LT: case LTU: case GE: case GEU:
2136 {
2137 rtx (*cmp_insn) (rtx, rtx);
2138 rtx (*sbb_insn) (rtx, rtx, rtx);
2139 bool uns = (code == LTU || code == GEU);
2140
2141 if (TARGET_64BIT)
2142 {
2143 cmp_insn = gen_cmpdi_1;
2144 sbb_insn
2145 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
2146 }
2147 else
2148 {
2149 cmp_insn = gen_cmpsi_1;
2150 sbb_insn
2151 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
2152 }
2153
2154 if (!nonimmediate_operand (lo[0], submode))
2155 lo[0] = force_reg (submode, lo[0]);
2156 if (!x86_64_general_operand (lo[1], submode))
2157 lo[1] = force_reg (submode, lo[1]);
2158
2159 if (!register_operand (hi[0], submode))
2160 hi[0] = force_reg (submode, hi[0]);
2161 if ((uns && !nonimmediate_operand (hi[1], submode))
2162 || (!uns && !x86_64_general_operand (hi[1], submode)))
2163 hi[1] = force_reg (submode, hi[1]);
2164
2165 emit_insn (cmp_insn (lo[0], lo[1]));
2166 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
2167
2168 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2169
2170 ix86_expand_branch (code, tmp, const0_rtx, label);
2171 return;
2172 }
2173
2174 default:
2175 break;
2176 }
2177
2178 /* Otherwise, we need two or three jumps. */
2179
2180 label2 = gen_label_rtx ();
2181
2182 code1 = code;
2183 code2 = swap_condition (code);
2184 code3 = unsigned_condition (code);
2185
2186 switch (code)
2187 {
2188 case LT: case GT: case LTU: case GTU:
2189 break;
2190
2191 case LE: code1 = LT; code2 = GT; break;
2192 case GE: code1 = GT; code2 = LT; break;
2193 case LEU: code1 = LTU; code2 = GTU; break;
2194 case GEU: code1 = GTU; code2 = LTU; break;
2195
2196 case EQ: code1 = UNKNOWN; code2 = NE; break;
2197 case NE: code2 = UNKNOWN; break;
2198
2199 default:
2200 gcc_unreachable ();
2201 }
2202
2203 /*
2204 * a < b =>
2205 * if (hi(a) < hi(b)) goto true;
2206 * if (hi(a) > hi(b)) goto false;
2207 * if (lo(a) < lo(b)) goto true;
2208 * false:
2209 */
2210
2211 if (code1 != UNKNOWN)
2212 ix86_expand_branch (code1, hi[0], hi[1], label);
2213 if (code2 != UNKNOWN)
2214 ix86_expand_branch (code2, hi[0], hi[1], label2);
2215
2216 ix86_expand_branch (code3, lo[0], lo[1], label);
2217
2218 if (code2 != UNKNOWN)
2219 emit_label (label2);
2220 return;
2221 }
2222
2223 default:
2224 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2225 goto simple;
2226 }
2227 }
2228
2229 /* Figure out whether to use unordered fp comparisons. */
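/* With IEEE math, EQ/NE, LTGT and the UNxx/ORDERED/UNORDERED codes must not
   raise an invalid exception on quiet NaNs, so they use the non-signaling
   (unordered) compare forms; LT/LE/GT/GE are defined to signal on any NaN
   and may use the ordinary compares.  */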
2230
2231 static bool
2232 ix86_unordered_fp_compare (enum rtx_code code)
2233 {
2234 if (!TARGET_IEEE_FP)
2235 return false;
2236
2237 switch (code)
2238 {
2239 case GT:
2240 case GE:
2241 case LT:
2242 case LE:
2243 return false;
2244
2245 case EQ:
2246 case NE:
2247
2248 case LTGT:
2249 case UNORDERED:
2250 case ORDERED:
2251 case UNLT:
2252 case UNLE:
2253 case UNGT:
2254 case UNGE:
2255 case UNEQ:
2256 return true;
2257
2258 default:
2259 gcc_unreachable ();
2260 }
2261 }
2262
2263 /* Return a comparison we can do that is equivalent to
2264 swap_condition (code), apart possibly from orderedness.
2265 But never change orderedness if TARGET_IEEE_FP, returning
2266 UNKNOWN in that case if necessary. */
2267
2268 static enum rtx_code
2269 ix86_fp_swap_condition (enum rtx_code code)
2270 {
2271 switch (code)
2272 {
2273 case GT: /* GTU - CF=0 & ZF=0 */
2274 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2275 case GE: /* GEU - CF=0 */
2276 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2277 case UNLT: /* LTU - CF=1 */
2278 return TARGET_IEEE_FP ? UNKNOWN : GT;
2279 case UNLE: /* LEU - CF=1 | ZF=1 */
2280 return TARGET_IEEE_FP ? UNKNOWN : GE;
2281 default:
2282 return swap_condition (code);
2283 }
2284 }
2285
2286 /* Return the cost of comparison CODE using the best strategy for performance.
2287 All of the following functions use the number of instructions as a cost metric.
2288 In the future this should be tweaked to compute bytes for optimize_size and
2289 take into account the performance of various instructions on various CPUs. */
2290
2291 static int
2292 ix86_fp_comparison_cost (enum rtx_code code)
2293 {
2294 int arith_cost;
2295
2296 /* The cost of code using bit-twiddling on %ah. */
2297 switch (code)
2298 {
2299 case UNLE:
2300 case UNLT:
2301 case LTGT:
2302 case GT:
2303 case GE:
2304 case UNORDERED:
2305 case ORDERED:
2306 case UNEQ:
2307 arith_cost = 4;
2308 break;
2309 case LT:
2310 case NE:
2311 case EQ:
2312 case UNGE:
2313 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2314 break;
2315 case LE:
2316 case UNGT:
2317 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2318 break;
2319 default:
2320 gcc_unreachable ();
2321 }
2322
2323 switch (ix86_fp_comparison_strategy (code))
2324 {
2325 case IX86_FPCMP_COMI:
2326 return arith_cost > 4 ? 3 : 2;
2327 case IX86_FPCMP_SAHF:
2328 return arith_cost > 4 ? 4 : 3;
2329 default:
2330 return arith_cost;
2331 }
2332 }
2333
2334 /* Swap, force into registers, or otherwise massage the two operands
2335 to a fp comparison. The operands are updated in place; the new
2336 comparison code is returned. */
2337
2338 static enum rtx_code
2339 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2340 {
2341 bool unordered_compare = ix86_unordered_fp_compare (code);
2342 rtx op0 = *pop0, op1 = *pop1;
2343 machine_mode op_mode = GET_MODE (op0);
2344 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2345
2346 /* All of the unordered compare instructions only work on registers.
2347 The same is true of the fcomi compare instructions. The XFmode
2348 compare instructions require registers except when comparing
2349 against zero or when converting operand 1 from fixed point to
2350 floating point. */
2351
2352 if (!is_sse
2353 && (unordered_compare
2354 || (op_mode == XFmode
2355 && ! (standard_80387_constant_p (op0) == 1
2356 || standard_80387_constant_p (op1) == 1)
2357 && GET_CODE (op1) != FLOAT)
2358 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2359 {
2360 op0 = force_reg (op_mode, op0);
2361 op1 = force_reg (op_mode, op1);
2362 }
2363 else
2364 {
2365 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2366 things around if they appear profitable, otherwise force op0
2367 into a register. */
2368
2369 if (standard_80387_constant_p (op0) == 0
2370 || (MEM_P (op0)
2371 && ! (standard_80387_constant_p (op1) == 0
2372 || MEM_P (op1))))
2373 {
2374 enum rtx_code new_code = ix86_fp_swap_condition (code);
2375 if (new_code != UNKNOWN)
2376 {
2377 std::swap (op0, op1);
2378 code = new_code;
2379 }
2380 }
2381
2382 if (!REG_P (op0))
2383 op0 = force_reg (op_mode, op0);
2384
2385 if (CONSTANT_P (op1))
2386 {
2387 int tmp = standard_80387_constant_p (op1);
2388 if (tmp == 0)
2389 op1 = validize_mem (force_const_mem (op_mode, op1));
2390 else if (tmp == 1)
2391 {
2392 if (TARGET_CMOVE)
2393 op1 = force_reg (op_mode, op1);
2394 }
2395 else
2396 op1 = force_reg (op_mode, op1);
2397 }
2398 }
2399
2400 /* Try to rearrange the comparison to make it cheaper. */
2401 if (ix86_fp_comparison_cost (code)
2402 > ix86_fp_comparison_cost (swap_condition (code))
2403 && (REG_P (op1) || can_create_pseudo_p ()))
2404 {
2405 std::swap (op0, op1);
2406 code = swap_condition (code);
2407 if (!REG_P (op0))
2408 op0 = force_reg (op_mode, op0);
2409 }
2410
2411 *pop0 = op0;
2412 *pop1 = op1;
2413 return code;
2414 }
2415
2416 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2417
2418 static rtx
2419 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2420 {
2421 bool unordered_compare = ix86_unordered_fp_compare (code);
2422 machine_mode cmp_mode;
2423 rtx tmp, scratch;
2424
2425 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2426
2427 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2428 if (unordered_compare)
2429 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2430
2431 /* Do fcomi/sahf based test when profitable. */
2432 switch (ix86_fp_comparison_strategy (code))
2433 {
2434 case IX86_FPCMP_COMI:
2435 cmp_mode = CCFPmode;
2436 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2437 break;
2438
2439 case IX86_FPCMP_SAHF:
2440 cmp_mode = CCFPmode;
2441 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2442 scratch = gen_reg_rtx (HImode);
2443 emit_insn (gen_rtx_SET (scratch, tmp));
2444 emit_insn (gen_x86_sahf_1 (scratch));
2445 break;
2446
2447 case IX86_FPCMP_ARITH:
2448 cmp_mode = CCNOmode;
2449 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2450 scratch = gen_reg_rtx (HImode);
2451 emit_insn (gen_rtx_SET (scratch, tmp));
2452
2453 /* In the unordered case, we have to check C2 for NaNs, which
2454 doesn't happen to work out to anything nice combination-wise.
2455 So do some bit twiddling on the value we've got in AH to come
2456 up with an appropriate set of condition codes. */
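/* After fnstsw the FPU condition bits land in AH as C0 = 0x01, C2 = 0x04
   and C3 = 0x40; an fcom-style compare sets C3 C2 C0 to 000 for >, 001
   for <, 100 for = and 111 for unordered, which is what the 0x45, 0x44,
   0x40 and 0x01 masks below pick apart.  */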
2457
2458 switch (code)
2459 {
2460 case GT:
2461 case UNGT:
2462 if (code == GT || !TARGET_IEEE_FP)
2463 {
2464 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2465 code = EQ;
2466 }
2467 else
2468 {
2469 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2470 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2471 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2472 cmp_mode = CCmode;
2473 code = GEU;
2474 }
2475 break;
2476 case LT:
2477 case UNLT:
2478 if (code == LT && TARGET_IEEE_FP)
2479 {
2480 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2481 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2482 cmp_mode = CCmode;
2483 code = EQ;
2484 }
2485 else
2486 {
2487 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2488 code = NE;
2489 }
2490 break;
2491 case GE:
2492 case UNGE:
2493 if (code == GE || !TARGET_IEEE_FP)
2494 {
2495 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2496 code = EQ;
2497 }
2498 else
2499 {
2500 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2501 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2502 code = NE;
2503 }
2504 break;
2505 case LE:
2506 case UNLE:
2507 if (code == LE && TARGET_IEEE_FP)
2508 {
2509 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2510 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2511 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2512 cmp_mode = CCmode;
2513 code = LTU;
2514 }
2515 else
2516 {
2517 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2518 code = NE;
2519 }
2520 break;
2521 case EQ:
2522 case UNEQ:
2523 if (code == EQ && TARGET_IEEE_FP)
2524 {
2525 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2526 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2527 cmp_mode = CCmode;
2528 code = EQ;
2529 }
2530 else
2531 {
2532 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2533 code = NE;
2534 }
2535 break;
2536 case NE:
2537 case LTGT:
2538 if (code == NE && TARGET_IEEE_FP)
2539 {
2540 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2541 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2542 GEN_INT (0x40)));
2543 code = NE;
2544 }
2545 else
2546 {
2547 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2548 code = EQ;
2549 }
2550 break;
2551
2552 case UNORDERED:
2553 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2554 code = NE;
2555 break;
2556 case ORDERED:
2557 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2558 code = EQ;
2559 break;
2560
2561 default:
2562 gcc_unreachable ();
2563 }
2564 break;
2565
2566 default:
2567 gcc_unreachable ();
2568 }
2569
2570 /* Return the test that should be put into the flags user, i.e.
2571 the bcc, scc, or cmov instruction. */
2572 return gen_rtx_fmt_ee (code, VOIDmode,
2573 gen_rtx_REG (cmp_mode, FLAGS_REG),
2574 const0_rtx);
2575 }
2576
2577 /* Generate insn patterns to do an integer compare of OPERANDS. */
2578
2579 static rtx
2580 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2581 {
2582 machine_mode cmpmode;
2583 rtx tmp, flags;
2584
2585 cmpmode = SELECT_CC_MODE (code, op0, op1);
2586 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2587
2588 /* This is very simple, but making the interface the same as in the
2589 FP case makes the rest of the code easier. */
2590 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2591 emit_insn (gen_rtx_SET (flags, tmp));
2592
2593 /* Return the test that should be put into the flags user, i.e.
2594 the bcc, scc, or cmov instruction. */
2595 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2596 }
2597
2598 static rtx
2599 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2600 {
2601 rtx ret;
2602
2603 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2604 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2605
2606 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2607 {
2608 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2609 ret = ix86_expand_fp_compare (code, op0, op1);
2610 }
2611 else
2612 ret = ix86_expand_int_compare (code, op0, op1);
2613
2614 return ret;
2615 }
2616
2617 void
2618 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2619 {
2620 rtx ret;
2621
2622 gcc_assert (GET_MODE (dest) == QImode);
2623
2624 ret = ix86_expand_compare (code, op0, op1);
2625 PUT_MODE (ret, QImode);
2626 emit_insn (gen_rtx_SET (dest, ret));
2627 }
2628
2629 /* Expand a comparison that sets or clears the carry flag. Return true
2630 when successful and set *POP to the comparison operation. */
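/* The rewrites below (e.g. a == 0 to (unsigned) a < 1) exist so that the
   result lands in the carry flag, which ix86_expand_int_addcc and
   ix86_expand_int_movcc can then consume with adc/sbb.  */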
2631 static bool
2632 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2633 {
2634 machine_mode mode
2635 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2636
2637 /* Do not handle double-mode compares, which go through a special path. */
2638 if (mode == (TARGET_64BIT ? TImode : DImode))
2639 return false;
2640
2641 if (SCALAR_FLOAT_MODE_P (mode))
2642 {
2643 rtx compare_op;
2644 rtx_insn *compare_seq;
2645
2646 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2647
2648 /* Shortcut: the following common codes never translate
2649 into carry flag compares. */
2650 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2651 || code == ORDERED || code == UNORDERED)
2652 return false;
2653
2654 /* These comparisons require the zero flag; swap the operands so they won't. */
2655 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2656 && !TARGET_IEEE_FP)
2657 {
2658 std::swap (op0, op1);
2659 code = swap_condition (code);
2660 }
2661
2662 /* Try to expand the comparison and verify that we end up with
2663 a carry-flag-based comparison. This fails only when we decide
2664 to expand the comparison using arithmetic, which is not a
2665 common scenario. */
2666 start_sequence ();
2667 compare_op = ix86_expand_fp_compare (code, op0, op1);
2668 compare_seq = get_insns ();
2669 end_sequence ();
2670
2671 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2672 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2673 else
2674 code = GET_CODE (compare_op);
2675
2676 if (code != LTU && code != GEU)
2677 return false;
2678
2679 emit_insn (compare_seq);
2680 *pop = compare_op;
2681 return true;
2682 }
2683
2684 if (!INTEGRAL_MODE_P (mode))
2685 return false;
2686
2687 switch (code)
2688 {
2689 case LTU:
2690 case GEU:
2691 break;
2692
2693 /* Convert a==0 into (unsigned)a<1. */
2694 case EQ:
2695 case NE:
2696 if (op1 != const0_rtx)
2697 return false;
2698 op1 = const1_rtx;
2699 code = (code == EQ ? LTU : GEU);
2700 break;
2701
2702 /* Convert a>b into b<a or a>=b+1. */
2703 case GTU:
2704 case LEU:
2705 if (CONST_INT_P (op1))
2706 {
2707 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2708 /* Bail out on overflow. We could still swap the operands, but
2709 that would force loading the constant into a register. */
2710 if (op1 == const0_rtx
2711 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2712 return false;
2713 code = (code == GTU ? GEU : LTU);
2714 }
2715 else
2716 {
2717 std::swap (op0, op1);
2718 code = (code == GTU ? LTU : GEU);
2719 }
2720 break;
2721
2722 /* Convert a>=0 into (unsigned)a<0x80000000. */
2723 case LT:
2724 case GE:
2725 if (mode == DImode || op1 != const0_rtx)
2726 return false;
2727 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2728 code = (code == LT ? GEU : LTU);
2729 break;
2730 case LE:
2731 case GT:
2732 if (mode == DImode || op1 != constm1_rtx)
2733 return false;
2734 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2735 code = (code == LE ? GEU : LTU);
2736 break;
2737
2738 default:
2739 return false;
2740 }
2741 /* Swapping the operands may cause a constant to appear as the first operand. */
2742 if (!nonimmediate_operand (op0, VOIDmode))
2743 {
2744 if (!can_create_pseudo_p ())
2745 return false;
2746 op0 = force_reg (mode, op0);
2747 }
2748 *pop = ix86_expand_compare (code, op0, op1);
2749 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2750 return true;
2751 }
2752
2753 /* Expand conditional increment or decrement using adc/sbb instructions.
2754 The default case using setcc followed by the conditional move can be
2755 done by generic code. */
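/* E.g. x = y + (a < b) with an unsigned compare roughly becomes a single
   cmp followed by adc $0, x (or sbb $0, x for the conditional decrement
   case), instead of a setcc plus an add.  */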
2756 bool
2757 ix86_expand_int_addcc (rtx operands[])
2758 {
2759 enum rtx_code code = GET_CODE (operands[1]);
2760 rtx flags;
2761 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
2762 rtx compare_op;
2763 rtx val = const0_rtx;
2764 bool fpcmp = false;
2765 machine_mode mode;
2766 rtx op0 = XEXP (operands[1], 0);
2767 rtx op1 = XEXP (operands[1], 1);
2768
2769 if (operands[3] != const1_rtx
2770 && operands[3] != constm1_rtx)
2771 return false;
2772 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2773 return false;
2774 code = GET_CODE (compare_op);
2775
2776 flags = XEXP (compare_op, 0);
2777
2778 if (GET_MODE (flags) == CCFPmode)
2779 {
2780 fpcmp = true;
2781 code = ix86_fp_compare_code_to_integer (code);
2782 }
2783
2784 if (code != LTU)
2785 {
2786 val = constm1_rtx;
2787 if (fpcmp)
2788 PUT_CODE (compare_op,
2789 reverse_condition_maybe_unordered
2790 (GET_CODE (compare_op)));
2791 else
2792 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2793 }
2794
2795 mode = GET_MODE (operands[0]);
2796
2797 /* Construct either adc or sbb insn. */
2798 if ((code == LTU) == (operands[3] == constm1_rtx))
2799 {
2800 switch (mode)
2801 {
2802 case E_QImode:
2803 insn = gen_subqi3_carry;
2804 break;
2805 case E_HImode:
2806 insn = gen_subhi3_carry;
2807 break;
2808 case E_SImode:
2809 insn = gen_subsi3_carry;
2810 break;
2811 case E_DImode:
2812 insn = gen_subdi3_carry;
2813 break;
2814 default:
2815 gcc_unreachable ();
2816 }
2817 }
2818 else
2819 {
2820 switch (mode)
2821 {
2822 case E_QImode:
2823 insn = gen_addqi3_carry;
2824 break;
2825 case E_HImode:
2826 insn = gen_addhi3_carry;
2827 break;
2828 case E_SImode:
2829 insn = gen_addsi3_carry;
2830 break;
2831 case E_DImode:
2832 insn = gen_adddi3_carry;
2833 break;
2834 default:
2835 gcc_unreachable ();
2836 }
2837 }
2838 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
2839
2840 return true;
2841 }
2842
2843 bool
2844 ix86_expand_int_movcc (rtx operands[])
2845 {
2846 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2847 rtx_insn *compare_seq;
2848 rtx compare_op;
2849 machine_mode mode = GET_MODE (operands[0]);
2850 bool sign_bit_compare_p = false;
2851 rtx op0 = XEXP (operands[1], 0);
2852 rtx op1 = XEXP (operands[1], 1);
2853
2854 if (GET_MODE (op0) == TImode
2855 || (GET_MODE (op0) == DImode
2856 && !TARGET_64BIT))
2857 return false;
2858
2859 start_sequence ();
2860 compare_op = ix86_expand_compare (code, op0, op1);
2861 compare_seq = get_insns ();
2862 end_sequence ();
2863
2864 compare_code = GET_CODE (compare_op);
2865
2866 if ((op1 == const0_rtx && (code == GE || code == LT))
2867 || (op1 == constm1_rtx && (code == GT || code == LE)))
2868 sign_bit_compare_p = true;
2869
2870 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2871 HImode insns, we'd be swallowed in word prefix ops. */
2872
2873 if ((mode != HImode || TARGET_FAST_PREFIX)
2874 && (mode != (TARGET_64BIT ? TImode : DImode))
2875 && CONST_INT_P (operands[2])
2876 && CONST_INT_P (operands[3]))
2877 {
2878 rtx out = operands[0];
2879 HOST_WIDE_INT ct = INTVAL (operands[2]);
2880 HOST_WIDE_INT cf = INTVAL (operands[3]);
2881 HOST_WIDE_INT diff;
2882
2883 diff = ct - cf;
2884 /* Sign bit compares are better done using shifts than by using
2885 sbb. */
2886 if (sign_bit_compare_p
2887 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2888 {
2889 /* Detect overlap between destination and compare sources. */
2890 rtx tmp = out;
2891
2892 if (!sign_bit_compare_p)
2893 {
2894 rtx flags;
2895 bool fpcmp = false;
2896
2897 compare_code = GET_CODE (compare_op);
2898
2899 flags = XEXP (compare_op, 0);
2900
2901 if (GET_MODE (flags) == CCFPmode)
2902 {
2903 fpcmp = true;
2904 compare_code
2905 = ix86_fp_compare_code_to_integer (compare_code);
2906 }
2907
2908 /* To simplify the rest of the code, restrict to the GEU case. */
2909 if (compare_code == LTU)
2910 {
2911 std::swap (ct, cf);
2912 compare_code = reverse_condition (compare_code);
2913 code = reverse_condition (code);
2914 }
2915 else
2916 {
2917 if (fpcmp)
2918 PUT_CODE (compare_op,
2919 reverse_condition_maybe_unordered
2920 (GET_CODE (compare_op)));
2921 else
2922 PUT_CODE (compare_op,
2923 reverse_condition (GET_CODE (compare_op)));
2924 }
2925 diff = ct - cf;
2926
2927 if (reg_overlap_mentioned_p (out, op0)
2928 || reg_overlap_mentioned_p (out, op1))
2929 tmp = gen_reg_rtx (mode);
2930
2931 if (mode == DImode)
2932 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2933 else
2934 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2935 flags, compare_op));
2936 }
2937 else
2938 {
2939 if (code == GT || code == GE)
2940 code = reverse_condition (code);
2941 else
2942 {
2943 std::swap (ct, cf);
2944 diff = ct - cf;
2945 }
2946 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2947 }
2948
2949 if (diff == 1)
2950 {
2951 /*
2952 * cmpl op0,op1
2953 * sbbl dest,dest
2954 * [addl dest, ct]
2955 *
2956 * Size 5 - 8.
2957 */
2958 if (ct)
2959 tmp = expand_simple_binop (mode, PLUS,
2960 tmp, GEN_INT (ct),
2961 copy_rtx (tmp), 1, OPTAB_DIRECT);
2962 }
2963 else if (cf == -1)
2964 {
2965 /*
2966 * cmpl op0,op1
2967 * sbbl dest,dest
2968 * orl $ct, dest
2969 *
2970 * Size 8.
2971 */
2972 tmp = expand_simple_binop (mode, IOR,
2973 tmp, GEN_INT (ct),
2974 copy_rtx (tmp), 1, OPTAB_DIRECT);
2975 }
2976 else if (diff == -1 && ct)
2977 {
2978 /*
2979 * cmpl op0,op1
2980 * sbbl dest,dest
2981 * notl dest
2982 * [addl dest, cf]
2983 *
2984 * Size 8 - 11.
2985 */
2986 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
2987 if (cf)
2988 tmp = expand_simple_binop (mode, PLUS,
2989 copy_rtx (tmp), GEN_INT (cf),
2990 copy_rtx (tmp), 1, OPTAB_DIRECT);
2991 }
2992 else
2993 {
2994 /*
2995 * cmpl op0,op1
2996 * sbbl dest,dest
2997 * [notl dest]
2998 * andl cf - ct, dest
2999 * [addl dest, ct]
3000 *
3001 * Size 8 - 11.
3002 */
3003
3004 if (cf == 0)
3005 {
3006 cf = ct;
3007 ct = 0;
3008 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3009 }
3010
3011 tmp = expand_simple_binop (mode, AND,
3012 copy_rtx (tmp),
3013 gen_int_mode (cf - ct, mode),
3014 copy_rtx (tmp), 1, OPTAB_DIRECT);
3015 if (ct)
3016 tmp = expand_simple_binop (mode, PLUS,
3017 copy_rtx (tmp), GEN_INT (ct),
3018 copy_rtx (tmp), 1, OPTAB_DIRECT);
3019 }
3020
3021 if (!rtx_equal_p (tmp, out))
3022 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3023
3024 return true;
3025 }
3026
3027 if (diff < 0)
3028 {
3029 machine_mode cmp_mode = GET_MODE (op0);
3030 enum rtx_code new_code;
3031
3032 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3033 {
3034 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3035
3036 /* We may be reversing an unordered compare to a normal compare, which
3037 is not valid in general (we may convert a non-trapping condition
3038 into a trapping one); however, on i386 we currently emit all
3039 comparisons unordered. */
3040 new_code = reverse_condition_maybe_unordered (code);
3041 }
3042 else
3043 new_code = ix86_reverse_condition (code, cmp_mode);
3044 if (new_code != UNKNOWN)
3045 {
3046 std::swap (ct, cf);
3047 diff = -diff;
3048 code = new_code;
3049 }
3050 }
3051
3052 compare_code = UNKNOWN;
3053 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3054 && CONST_INT_P (op1))
3055 {
3056 if (op1 == const0_rtx
3057 && (code == LT || code == GE))
3058 compare_code = code;
3059 else if (op1 == constm1_rtx)
3060 {
3061 if (code == LE)
3062 compare_code = LT;
3063 else if (code == GT)
3064 compare_code = GE;
3065 }
3066 }
3067
3068 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3069 if (compare_code != UNKNOWN
3070 && GET_MODE (op0) == GET_MODE (out)
3071 && (cf == -1 || ct == -1))
3072 {
3073 /* If the lea code below could be used, only optimize
3074 if it results in a 2-insn sequence. */
3075
3076 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3077 || diff == 3 || diff == 5 || diff == 9)
3078 || (compare_code == LT && ct == -1)
3079 || (compare_code == GE && cf == -1))
3080 {
3081 /*
3082 * notl op1 (if necessary)
3083 * sarl $31, op1
3084 * orl cf, op1
3085 */
3086 if (ct != -1)
3087 {
3088 cf = ct;
3089 ct = -1;
3090 code = reverse_condition (code);
3091 }
3092
3093 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3094
3095 out = expand_simple_binop (mode, IOR,
3096 out, GEN_INT (cf),
3097 out, 1, OPTAB_DIRECT);
3098 if (out != operands[0])
3099 emit_move_insn (operands[0], out);
3100
3101 return true;
3102 }
3103 }
3104
3105
3106 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3107 || diff == 3 || diff == 5 || diff == 9)
3108 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3109 && (mode != DImode
3110 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3111 {
3112 /*
3113 * xorl dest,dest
3114 * cmpl op1,op2
3115 * setcc dest
3116 * lea cf(dest*(ct-cf)),dest
3117 *
3118 * Size 14.
3119 *
3120 * This also catches the degenerate setcc-only case.
3121 */
3122
3123 rtx tmp;
3124 int nops;
3125
3126 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3127
3128 nops = 0;
3129 /* On x86_64 the lea instruction operates on Pmode, so we need
3130 to do the arithmetic in the proper mode to match. */
3131 if (diff == 1)
3132 tmp = copy_rtx (out);
3133 else
3134 {
3135 rtx out1;
3136 out1 = copy_rtx (out);
3137 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3138 nops++;
3139 if (diff & 1)
3140 {
3141 tmp = gen_rtx_PLUS (mode, tmp, out1);
3142 nops++;
3143 }
3144 }
3145 if (cf != 0)
3146 {
3147 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
3148 nops++;
3149 }
3150 if (!rtx_equal_p (tmp, out))
3151 {
3152 if (nops == 1)
3153 out = force_operand (tmp, copy_rtx (out));
3154 else
3155 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3156 }
3157 if (!rtx_equal_p (out, operands[0]))
3158 emit_move_insn (operands[0], copy_rtx (out));
3159
3160 return true;
3161 }
3162
3163 /*
3164 * General case: Jumpful:
3165 * xorl dest,dest cmpl op1, op2
3166 * cmpl op1, op2 movl ct, dest
3167 * setcc dest jcc 1f
3168 * decl dest movl cf, dest
3169 * andl (cf-ct),dest 1:
3170 * addl ct,dest
3171 *
3172 * Size 20. Size 14.
3173 *
3174 * This is reasonably steep, but branch mispredict costs are
3175 * high on modern cpus, so consider failing only if optimizing
3176 * for space.
3177 */
3178
3179 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3180 && BRANCH_COST (optimize_insn_for_speed_p (),
3181 false) >= 2)
3182 {
3183 if (cf == 0)
3184 {
3185 machine_mode cmp_mode = GET_MODE (op0);
3186 enum rtx_code new_code;
3187
3188 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3189 {
3190 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3191
3192 /* We may be reversing an unordered compare to a normal compare,
3193 which is not valid in general (we may convert a non-trapping
3194 condition into a trapping one); however, on i386 we currently
3195 emit all comparisons unordered. */
3196 new_code = reverse_condition_maybe_unordered (code);
3197 }
3198 else
3199 {
3200 new_code = ix86_reverse_condition (code, cmp_mode);
3201 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3202 compare_code = reverse_condition (compare_code);
3203 }
3204
3205 if (new_code != UNKNOWN)
3206 {
3207 cf = ct;
3208 ct = 0;
3209 code = new_code;
3210 }
3211 }
3212
3213 if (compare_code != UNKNOWN)
3214 {
3215 /* notl op1 (if needed)
3216 sarl $31, op1
3217 andl (cf-ct), op1
3218 addl ct, op1
3219
3220 For x < 0 (resp. x <= -1) there will be no notl,
3221 so if possible swap the constants to get rid of the
3222 complement.
3223 True/false will be -1/0 while code below (store flag
3224 followed by decrement) is 0/-1, so the constants need
3225 to be exchanged once more. */
3226
3227 if (compare_code == GE || !cf)
3228 {
3229 code = reverse_condition (code);
3230 compare_code = LT;
3231 }
3232 else
3233 std::swap (ct, cf);
3234
3235 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3236 }
3237 else
3238 {
3239 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3240
3241 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3242 constm1_rtx,
3243 copy_rtx (out), 1, OPTAB_DIRECT);
3244 }
3245
3246 out = expand_simple_binop (mode, AND, copy_rtx (out),
3247 gen_int_mode (cf - ct, mode),
3248 copy_rtx (out), 1, OPTAB_DIRECT);
3249 if (ct)
3250 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3251 copy_rtx (out), 1, OPTAB_DIRECT);
3252 if (!rtx_equal_p (out, operands[0]))
3253 emit_move_insn (operands[0], copy_rtx (out));
3254
3255 return true;
3256 }
3257 }
3258
3259 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3260 {
3261 /* Try a few more things with specific constants and a variable. */
3262
3263 optab op;
3264 rtx var, orig_out, out, tmp;
3265
3266 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3267 return false;
3268
3269 /* If one of the two operands is an interesting constant, load a
3270 constant with the above and mask it in with a logical operation. */
3271
3272 if (CONST_INT_P (operands[2]))
3273 {
3274 var = operands[3];
3275 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3276 operands[3] = constm1_rtx, op = and_optab;
3277 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3278 operands[3] = const0_rtx, op = ior_optab;
3279 else
3280 return false;
3281 }
3282 else if (CONST_INT_P (operands[3]))
3283 {
3284 var = operands[2];
3285 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3286 operands[2] = constm1_rtx, op = and_optab;
3287 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3288 operands[2] = const0_rtx, op = ior_optab;
3289 else
3290 return false;
3291 }
3292 else
3293 return false;
3294
3295 orig_out = operands[0];
3296 tmp = gen_reg_rtx (mode);
3297 operands[0] = tmp;
3298
3299 /* Recurse to get the constant loaded. */
3300 if (!ix86_expand_int_movcc (operands))
3301 return false;
3302
3303 /* Mask in the interesting variable. */
3304 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3305 OPTAB_WIDEN);
3306 if (!rtx_equal_p (out, orig_out))
3307 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3308
3309 return true;
3310 }
3311
3312 /*
3313 * For comparison with above,
3314 *
3315 * movl cf,dest
3316 * movl ct,tmp
3317 * cmpl op1,op2
3318 * cmovcc tmp,dest
3319 *
3320 * Size 15.
3321 */
3322
3323 if (! nonimmediate_operand (operands[2], mode))
3324 operands[2] = force_reg (mode, operands[2]);
3325 if (! nonimmediate_operand (operands[3], mode))
3326 operands[3] = force_reg (mode, operands[3]);
3327
3328 if (! register_operand (operands[2], VOIDmode)
3329 && (mode == QImode
3330 || ! register_operand (operands[3], VOIDmode)))
3331 operands[2] = force_reg (mode, operands[2]);
3332
3333 if (mode == QImode
3334 && ! register_operand (operands[3], VOIDmode))
3335 operands[3] = force_reg (mode, operands[3]);
3336
3337 emit_insn (compare_seq);
3338 emit_insn (gen_rtx_SET (operands[0],
3339 gen_rtx_IF_THEN_ELSE (mode,
3340 compare_op, operands[2],
3341 operands[3])));
3342 return true;
3343 }
3344
3345 /* Detect conditional moves that exactly match min/max operational
3346 semantics. Note that this is IEEE safe, as long as we don't
3347 interchange the operands.
3348
3349 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3350 and TRUE if the operation is successful and instructions are emitted. */
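/* E.g. a < b ? a : b maps to minss/minps.  The operand order matters
   because minss returns its second operand when the comparison is
   unordered or when comparing -0.0 with +0.0, which is why the
   UNSPEC_IEEE_MIN/MAX path below is used unless -ffinite-math-only and
   -fno-signed-zeros are in effect.  */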
3351
3352 static bool
3353 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3354 rtx cmp_op1, rtx if_true, rtx if_false)
3355 {
3356 machine_mode mode;
3357 bool is_min;
3358 rtx tmp;
3359
3360 if (code == LT)
3361 ;
3362 else if (code == UNGE)
3363 std::swap (if_true, if_false);
3364 else
3365 return false;
3366
3367 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3368 is_min = true;
3369 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3370 is_min = false;
3371 else
3372 return false;
3373
3374 mode = GET_MODE (dest);
3375
3376 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3377 but MODE may be a vector mode and thus not appropriate. */
3378 if (!flag_finite_math_only || flag_signed_zeros)
3379 {
3380 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3381 rtvec v;
3382
3383 if_true = force_reg (mode, if_true);
3384 v = gen_rtvec (2, if_true, if_false);
3385 tmp = gen_rtx_UNSPEC (mode, v, u);
3386 }
3387 else
3388 {
3389 code = is_min ? SMIN : SMAX;
3390 if (MEM_P (if_true) && MEM_P (if_false))
3391 if_true = force_reg (mode, if_true);
3392 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3393 }
3394
3395 emit_insn (gen_rtx_SET (dest, tmp));
3396 return true;
3397 }
3398
3399 /* Expand an SSE comparison. Return the register with the result. */
3400
3401 static rtx
3402 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3403 rtx op_true, rtx op_false)
3404 {
3405 machine_mode mode = GET_MODE (dest);
3406 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3407
3408 /* In the general case the result of the comparison can differ from the operands' type. */
3409 machine_mode cmp_mode;
3410
3411 /* In AVX512F the result of comparison is an integer mask. */
3412 bool maskcmp = false;
3413 rtx x;
3414
3415 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
3416 {
3417 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3418 cmp_mode = int_mode_for_size (nbits, 0).require ();
3419 maskcmp = true;
3420 }
3421 else
3422 cmp_mode = cmp_ops_mode;
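/* For a 512-bit compare such as V16SImode, cmp_mode is thus HImode: one
   mask bit per element rather than a vector of the operands' mode.  */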
3423
3424 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3425
3426 int (*op1_predicate)(rtx, machine_mode)
3427 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3428
3429 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3430 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3431
3432 if (optimize
3433 || (maskcmp && cmp_mode != mode)
3434 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3435 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3436 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3437
3438 /* Compare patterns for int modes are unspec in AVX512F only. */
3439 if (maskcmp && (code == GT || code == EQ))
3440 {
3441 rtx (*gen)(rtx, rtx, rtx);
3442
3443 switch (cmp_ops_mode)
3444 {
3445 case E_V64QImode:
3446 gcc_assert (TARGET_AVX512BW);
3447 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
3448 break;
3449 case E_V32HImode:
3450 gcc_assert (TARGET_AVX512BW);
3451 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
3452 break;
3453 case E_V16SImode:
3454 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
3455 break;
3456 case E_V8DImode:
3457 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
3458 break;
3459 default:
3460 gen = NULL;
3461 }
3462
3463 if (gen)
3464 {
3465 emit_insn (gen (dest, cmp_op0, cmp_op1));
3466 return dest;
3467 }
3468 }
3469 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3470
3471 if (cmp_mode != mode && !maskcmp)
3472 {
3473 x = force_reg (cmp_ops_mode, x);
3474 convert_move (dest, x, false);
3475 }
3476 else
3477 emit_insn (gen_rtx_SET (dest, x));
3478
3479 return dest;
3480 }
3481
3482 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3483 operations. This is used for both scalar and vector conditional moves. */
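/* Without blend instructions this is the classic mask select
       DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE);
   with SSE4.1/AVX it maps to the blendv/pblendv instructions, and with
   AVX-512 mask registers to the vblendm/vpblendm forms.  */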
3484
3485 void
3486 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3487 {
3488 machine_mode mode = GET_MODE (dest);
3489 machine_mode cmpmode = GET_MODE (cmp);
3490
3491 /* In AVX512F the result of comparison is an integer mask. */
3492 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
3493
3494 rtx t2, t3, x;
3495
3496 /* If we have an integer mask and an FP value then we need
3497 to cast the mask to the FP mode. */
3498 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3499 {
3500 cmp = force_reg (cmpmode, cmp);
3501 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3502 }
3503
3504 if (maskcmp)
3505 {
3506 rtx (*gen) (rtx, rtx) = NULL;
3507 if ((op_true == CONST0_RTX (mode)
3508 && vector_all_ones_operand (op_false, mode))
3509 || (op_false == CONST0_RTX (mode)
3510 && vector_all_ones_operand (op_true, mode)))
3511 switch (mode)
3512 {
3513 case E_V64QImode:
3514 if (TARGET_AVX512BW)
3515 gen = gen_avx512bw_cvtmask2bv64qi;
3516 break;
3517 case E_V32QImode:
3518 if (TARGET_AVX512VL && TARGET_AVX512BW)
3519 gen = gen_avx512vl_cvtmask2bv32qi;
3520 break;
3521 case E_V16QImode:
3522 if (TARGET_AVX512VL && TARGET_AVX512BW)
3523 gen = gen_avx512vl_cvtmask2bv16qi;
3524 break;
3525 case E_V32HImode:
3526 if (TARGET_AVX512BW)
3527 gen = gen_avx512bw_cvtmask2wv32hi;
3528 break;
3529 case E_V16HImode:
3530 if (TARGET_AVX512VL && TARGET_AVX512BW)
3531 gen = gen_avx512vl_cvtmask2wv16hi;
3532 break;
3533 case E_V8HImode:
3534 if (TARGET_AVX512VL && TARGET_AVX512BW)
3535 gen = gen_avx512vl_cvtmask2wv8hi;
3536 break;
3537 case E_V16SImode:
3538 if (TARGET_AVX512DQ)
3539 gen = gen_avx512f_cvtmask2dv16si;
3540 break;
3541 case E_V8SImode:
3542 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3543 gen = gen_avx512vl_cvtmask2dv8si;
3544 break;
3545 case E_V4SImode:
3546 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3547 gen = gen_avx512vl_cvtmask2dv4si;
3548 break;
3549 case E_V8DImode:
3550 if (TARGET_AVX512DQ)
3551 gen = gen_avx512f_cvtmask2qv8di;
3552 break;
3553 case E_V4DImode:
3554 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3555 gen = gen_avx512vl_cvtmask2qv4di;
3556 break;
3557 case E_V2DImode:
3558 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3559 gen = gen_avx512vl_cvtmask2qv2di;
3560 break;
3561 default:
3562 break;
3563 }
3564 if (gen && SCALAR_INT_MODE_P (cmpmode))
3565 {
3566 cmp = force_reg (cmpmode, cmp);
3567 if (op_true == CONST0_RTX (mode))
3568 {
3569 rtx (*gen_not) (rtx, rtx);
3570 switch (cmpmode)
3571 {
3572 case E_QImode: gen_not = gen_knotqi; break;
3573 case E_HImode: gen_not = gen_knothi; break;
3574 case E_SImode: gen_not = gen_knotsi; break;
3575 case E_DImode: gen_not = gen_knotdi; break;
3576 default: gcc_unreachable ();
3577 }
3578 rtx n = gen_reg_rtx (cmpmode);
3579 emit_insn (gen_not (n, cmp));
3580 cmp = n;
3581 }
3582 emit_insn (gen (dest, cmp));
3583 return;
3584 }
3585 }
3586 else if (vector_all_ones_operand (op_true, mode)
3587 && op_false == CONST0_RTX (mode))
3588 {
3589 emit_insn (gen_rtx_SET (dest, cmp));
3590 return;
3591 }
3592 else if (op_false == CONST0_RTX (mode))
3593 {
3594 op_true = force_reg (mode, op_true);
3595 x = gen_rtx_AND (mode, cmp, op_true);
3596 emit_insn (gen_rtx_SET (dest, x));
3597 return;
3598 }
3599 else if (op_true == CONST0_RTX (mode))
3600 {
3601 op_false = force_reg (mode, op_false);
3602 x = gen_rtx_NOT (mode, cmp);
3603 x = gen_rtx_AND (mode, x, op_false);
3604 emit_insn (gen_rtx_SET (dest, x));
3605 return;
3606 }
3607 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3608 {
3609 op_false = force_reg (mode, op_false);
3610 x = gen_rtx_IOR (mode, cmp, op_false);
3611 emit_insn (gen_rtx_SET (dest, x));
3612 return;
3613 }
3614 else if (TARGET_XOP)
3615 {
3616 op_true = force_reg (mode, op_true);
3617
3618 if (!nonimmediate_operand (op_false, mode))
3619 op_false = force_reg (mode, op_false);
3620
3621 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3622 op_true,
3623 op_false)));
3624 return;
3625 }
3626
3627 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3628 rtx d = dest;
3629
3630 if (!vector_operand (op_true, mode))
3631 op_true = force_reg (mode, op_true);
3632
3633 op_false = force_reg (mode, op_false);
3634
3635 switch (mode)
3636 {
3637 case E_V4SFmode:
3638 if (TARGET_SSE4_1)
3639 gen = gen_sse4_1_blendvps;
3640 break;
3641 case E_V2DFmode:
3642 if (TARGET_SSE4_1)
3643 gen = gen_sse4_1_blendvpd;
3644 break;
3645 case E_SFmode:
3646 if (TARGET_SSE4_1)
3647 {
3648 gen = gen_sse4_1_blendvss;
3649 op_true = force_reg (mode, op_true);
3650 }
3651 break;
3652 case E_DFmode:
3653 if (TARGET_SSE4_1)
3654 {
3655 gen = gen_sse4_1_blendvsd;
3656 op_true = force_reg (mode, op_true);
3657 }
3658 break;
3659 case E_V16QImode:
3660 case E_V8HImode:
3661 case E_V4SImode:
3662 case E_V2DImode:
3663 if (TARGET_SSE4_1)
3664 {
3665 gen = gen_sse4_1_pblendvb;
3666 if (mode != V16QImode)
3667 d = gen_reg_rtx (V16QImode);
3668 op_false = gen_lowpart (V16QImode, op_false);
3669 op_true = gen_lowpart (V16QImode, op_true);
3670 cmp = gen_lowpart (V16QImode, cmp);
3671 }
3672 break;
3673 case E_V8SFmode:
3674 if (TARGET_AVX)
3675 gen = gen_avx_blendvps256;
3676 break;
3677 case E_V4DFmode:
3678 if (TARGET_AVX)
3679 gen = gen_avx_blendvpd256;
3680 break;
3681 case E_V32QImode:
3682 case E_V16HImode:
3683 case E_V8SImode:
3684 case E_V4DImode:
3685 if (TARGET_AVX2)
3686 {
3687 gen = gen_avx2_pblendvb;
3688 if (mode != V32QImode)
3689 d = gen_reg_rtx (V32QImode);
3690 op_false = gen_lowpart (V32QImode, op_false);
3691 op_true = gen_lowpart (V32QImode, op_true);
3692 cmp = gen_lowpart (V32QImode, cmp);
3693 }
3694 break;
3695
3696 case E_V64QImode:
3697 gen = gen_avx512bw_blendmv64qi;
3698 break;
3699 case E_V32HImode:
3700 gen = gen_avx512bw_blendmv32hi;
3701 break;
3702 case E_V16SImode:
3703 gen = gen_avx512f_blendmv16si;
3704 break;
3705 case E_V8DImode:
3706 gen = gen_avx512f_blendmv8di;
3707 break;
3708 case E_V8DFmode:
3709 gen = gen_avx512f_blendmv8df;
3710 break;
3711 case E_V16SFmode:
3712 gen = gen_avx512f_blendmv16sf;
3713 break;
3714
3715 default:
3716 break;
3717 }
3718
3719 if (gen != NULL)
3720 {
3721 emit_insn (gen (d, op_false, op_true, cmp));
3722 if (d != dest)
3723 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3724 }
3725 else
3726 {
3727 op_true = force_reg (mode, op_true);
3728
3729 t2 = gen_reg_rtx (mode);
3730 if (optimize)
3731 t3 = gen_reg_rtx (mode);
3732 else
3733 t3 = dest;
3734
3735 x = gen_rtx_AND (mode, op_true, cmp);
3736 emit_insn (gen_rtx_SET (t2, x));
3737
3738 x = gen_rtx_NOT (mode, cmp);
3739 x = gen_rtx_AND (mode, x, op_false);
3740 emit_insn (gen_rtx_SET (t3, x));
3741
3742 x = gen_rtx_IOR (mode, t3, t2);
3743 emit_insn (gen_rtx_SET (dest, x));
3744 }
3745 }
3746
3747 /* Swap, force into registers, or otherwise massage the two operands
3748 to an SSE comparison with a mask result. Thus we differ a bit from
3749 ix86_prepare_fp_compare_args, which expects to produce a flags result.
3750
3751 The DEST operand exists to help determine whether to commute commutative
3752 operators. The POP0/POP1 operands are updated in place. The new
3753 comparison code is returned, or UNKNOWN if not implementable. */
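/* E.g. before AVX there is no cmpgtps, so a GT compare is rewritten here
   as LT with the operands swapped so that cmpltps can be used.  */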
3754
3755 static enum rtx_code
3756 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3757 rtx *pop0, rtx *pop1)
3758 {
3759 switch (code)
3760 {
3761 case LTGT:
3762 case UNEQ:
3763 /* AVX supports all the needed comparisons. */
3764 if (TARGET_AVX)
3765 break;
3766 /* We have no LTGT as an operator. We could implement it with
3767 NE & ORDERED, but this requires an extra temporary. It's
3768 not clear that it's worth it. */
3769 return UNKNOWN;
3770
3771 case LT:
3772 case LE:
3773 case UNGT:
3774 case UNGE:
3775 /* These are supported directly. */
3776 break;
3777
3778 case EQ:
3779 case NE:
3780 case UNORDERED:
3781 case ORDERED:
3782 /* AVX has 3 operand comparisons, no need to swap anything. */
3783 if (TARGET_AVX)
3784 break;
3785 /* For commutative operators, try to canonicalize the destination
3786 operand to be first in the comparison - this helps reload to
3787 avoid extra moves. */
3788 if (!dest || !rtx_equal_p (dest, *pop1))
3789 break;
3790 /* FALLTHRU */
3791
3792 case GE:
3793 case GT:
3794 case UNLE:
3795 case UNLT:
3796 /* These are not supported directly before AVX, and furthermore
3797 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3798 comparison operands to transform into something that is
3799 supported. */
3800 std::swap (*pop0, *pop1);
3801 code = swap_condition (code);
3802 break;
3803
3804 default:
3805 gcc_unreachable ();
3806 }
3807
3808 return code;
3809 }
3810
3811 /* Expand a floating-point conditional move. Return true if successful. */
3812
3813 bool
3814 ix86_expand_fp_movcc (rtx operands[])
3815 {
3816 machine_mode mode = GET_MODE (operands[0]);
3817 enum rtx_code code = GET_CODE (operands[1]);
3818 rtx tmp, compare_op;
3819 rtx op0 = XEXP (operands[1], 0);
3820 rtx op1 = XEXP (operands[1], 1);
3821
3822 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3823 {
3824 machine_mode cmode;
3825
3826 /* Since we have no cmove for SSE registers, don't force bad register
3827 allocation just to gain access to it. Deny the movcc when the
3828 comparison mode doesn't match the move mode. */
3829 cmode = GET_MODE (op0);
3830 if (cmode == VOIDmode)
3831 cmode = GET_MODE (op1);
3832 if (cmode != mode)
3833 return false;
3834
3835 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3836 if (code == UNKNOWN)
3837 return false;
3838
3839 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3840 operands[2], operands[3]))
3841 return true;
3842
3843 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3844 operands[2], operands[3]);
3845 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3846 return true;
3847 }
3848
3849 if (GET_MODE (op0) == TImode
3850 || (GET_MODE (op0) == DImode
3851 && !TARGET_64BIT))
3852 return false;
3853
3854 /* The floating point conditional move instructions don't directly
3855 support conditions resulting from a signed integer comparison. */
3856
3857 compare_op = ix86_expand_compare (code, op0, op1);
3858 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3859 {
3860 tmp = gen_reg_rtx (QImode);
3861 ix86_expand_setcc (tmp, code, op0, op1);
3862
3863 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3864 }
3865
3866 emit_insn (gen_rtx_SET (operands[0],
3867 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3868 operands[2], operands[3])));
3869
3870 return true;
3871 }
3872
3873 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3874
3875 static int
3876 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3877 {
3878 switch (code)
3879 {
3880 case EQ:
3881 return 0;
3882 case LT:
3883 case LTU:
3884 return 1;
3885 case LE:
3886 case LEU:
3887 return 2;
3888 case NE:
3889 return 4;
3890 case GE:
3891 case GEU:
3892 return 5;
3893 case GT:
3894 case GTU:
3895 return 6;
3896 default:
3897 gcc_unreachable ();
3898 }
3899 }
3900
3901 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3902
3903 static int
3904 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3905 {
3906 switch (code)
3907 {
3908 case EQ:
3909 return 0x00;
3910 case NE:
3911 return 0x04;
3912 case GT:
3913 return 0x0e;
3914 case LE:
3915 return 0x02;
3916 case GE:
3917 return 0x0d;
3918 case LT:
3919 return 0x01;
3920 case UNLE:
3921 return 0x0a;
3922 case UNLT:
3923 return 0x09;
3924 case UNGE:
3925 return 0x05;
3926 case UNGT:
3927 return 0x06;
3928 case UNEQ:
3929 return 0x18;
3930 case LTGT:
3931 return 0x0c;
3932 case ORDERED:
3933 return 0x07;
3934 case UNORDERED:
3935 return 0x03;
3936 default:
3937 gcc_unreachable ();
3938 }
3939 }
3940
3941 /* Return immediate value to be used in UNSPEC_PCMP
3942 for comparison CODE in MODE. */
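/* These values follow the hardware predicate encodings: the integer ones
   are the vpcmp[u]{b,w,d,q} immediates (0 eq, 1 lt, 2 le, 4 ne, 5 nlt,
   6 nle) and the FP ones the vcmpps/vcmppd immediates (e.g. 0x00 EQ_OQ,
   0x04 NEQ_UQ, 0x03 UNORD_Q).  */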
3943
3944 static int
3945 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3946 {
3947 if (FLOAT_MODE_P (mode))
3948 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3949 return ix86_int_cmp_code_to_pcmp_immediate (code);
3950 }
3951
3952 /* Expand AVX-512 vector comparison. */
3953
3954 bool
3955 ix86_expand_mask_vec_cmp (rtx operands[])
3956 {
3957 machine_mode mask_mode = GET_MODE (operands[0]);
3958 machine_mode cmp_mode = GET_MODE (operands[2]);
3959 enum rtx_code code = GET_CODE (operands[1]);
3960 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3961 int unspec_code;
3962 rtx unspec;
3963
3964 switch (code)
3965 {
3966 case LEU:
3967 case GTU:
3968 case GEU:
3969 case LTU:
3970 unspec_code = UNSPEC_UNSIGNED_PCMP;
3971 break;
3972
3973 default:
3974 unspec_code = UNSPEC_PCMP;
3975 }
3976
3977 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
3978 operands[3], imm),
3979 unspec_code);
3980 emit_insn (gen_rtx_SET (operands[0], unspec));
3981
3982 return true;
3983 }
3984
3985 /* Expand fp vector comparison. */
3986
3987 bool
3988 ix86_expand_fp_vec_cmp (rtx operands[])
3989 {
3990 enum rtx_code code = GET_CODE (operands[1]);
3991 rtx cmp;
3992
3993 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3994 &operands[2], &operands[3]);
3995 if (code == UNKNOWN)
3996 {
3997 rtx temp;
3998 switch (GET_CODE (operands[1]))
3999 {
4000 case LTGT:
4001 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4002 operands[3], NULL, NULL);
4003 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4004 operands[3], NULL, NULL);
4005 code = AND;
4006 break;
4007 case UNEQ:
4008 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4009 operands[3], NULL, NULL);
4010 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4011 operands[3], NULL, NULL);
4012 code = IOR;
4013 break;
4014 default:
4015 gcc_unreachable ();
4016 }
4017 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4018 OPTAB_DIRECT);
4019 }
4020 else
4021 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4022 operands[1], operands[2]);
4023
4024 if (operands[0] != cmp)
4025 emit_move_insn (operands[0], cmp);
4026
4027 return true;
4028 }
4029
4030 static rtx
4031 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4032 rtx op_true, rtx op_false, bool *negate)
4033 {
4034 machine_mode data_mode = GET_MODE (dest);
4035 machine_mode mode = GET_MODE (cop0);
4036 rtx x;
4037
4038 *negate = false;
4039
4040 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4041 if (TARGET_XOP
4042 && (mode == V16QImode || mode == V8HImode
4043 || mode == V4SImode || mode == V2DImode))
4044 ;
4045 else
4046 {
4047 /* Canonicalize the comparison to EQ, GT, GTU. */
4048 switch (code)
4049 {
4050 case EQ:
4051 case GT:
4052 case GTU:
4053 break;
4054
4055 case NE:
4056 case LE:
4057 case LEU:
4058 code = reverse_condition (code);
4059 *negate = true;
4060 break;
4061
4062 case GE:
4063 case GEU:
4064 code = reverse_condition (code);
4065 *negate = true;
4066 /* FALLTHRU */
4067
4068 case LT:
4069 case LTU:
4070 std::swap (cop0, cop1);
4071 code = swap_condition (code);
4072 break;
4073
4074 default:
4075 gcc_unreachable ();
4076 }
4077
4078 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4079 if (mode == V2DImode)
4080 {
4081 switch (code)
4082 {
4083 case EQ:
4084 /* SSE4.1 supports EQ. */
4085 if (!TARGET_SSE4_1)
4086 return NULL;
4087 break;
4088
4089 case GT:
4090 case GTU:
4091 /* SSE4.2 supports GT/GTU. */
4092 if (!TARGET_SSE4_2)
4093 return NULL;
4094 break;
4095
4096 default:
4097 gcc_unreachable ();
4098 }
4099 }
4100
4101 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4102 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4103 if (*negate)
4104 std::swap (optrue, opfalse);
4105
4106 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4107 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4108 min (x, y) == x). While we add one instruction (the minimum),
4109 we remove the need for the two instructions of the negation, as the
4110 result already has the desired form.
4111 When using masks, do it for SI/DImode element types, as it is shorter
4112 than the two subtractions. */
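/* E.g. an unsigned x <= y byte compare can then be emitted as
   pminub + pcmpeqb, avoiding both the sign-bit bias of the GTU path
   and the extra negation.  */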
4113 if ((code != EQ
4114 && GET_MODE_SIZE (mode) != 64
4115 && vector_all_ones_operand (opfalse, data_mode)
4116 && optrue == CONST0_RTX (data_mode))
4117 || (code == GTU
4118 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4119 /* Don't do it if not using integer masks and we'd end up with
4120 the right values in the registers though. */
4121 && (GET_MODE_SIZE (mode) == 64
4122 || !vector_all_ones_operand (optrue, data_mode)
4123 || opfalse != CONST0_RTX (data_mode))))
4124 {
4125 rtx (*gen) (rtx, rtx, rtx) = NULL;
4126
4127 switch (mode)
4128 {
4129 case E_V16SImode:
4130 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4131 break;
4132 case E_V8DImode:
4133 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4134 cop0 = force_reg (mode, cop0);
4135 cop1 = force_reg (mode, cop1);
4136 break;
4137 case E_V32QImode:
4138 if (TARGET_AVX2)
4139 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4140 break;
4141 case E_V16HImode:
4142 if (TARGET_AVX2)
4143 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4144 break;
4145 case E_V8SImode:
4146 if (TARGET_AVX2)
4147 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4148 break;
4149 case E_V4DImode:
4150 if (TARGET_AVX512VL)
4151 {
4152 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4153 cop0 = force_reg (mode, cop0);
4154 cop1 = force_reg (mode, cop1);
4155 }
4156 break;
4157 case E_V16QImode:
4158 if (code == GTU && TARGET_SSE2)
4159 gen = gen_uminv16qi3;
4160 else if (code == GT && TARGET_SSE4_1)
4161 gen = gen_sminv16qi3;
4162 break;
4163 case E_V8HImode:
4164 if (code == GTU && TARGET_SSE4_1)
4165 gen = gen_uminv8hi3;
4166 else if (code == GT && TARGET_SSE2)
4167 gen = gen_sminv8hi3;
4168 break;
4169 case E_V4SImode:
4170 if (TARGET_SSE4_1)
4171 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4172 break;
4173 case E_V2DImode:
4174 if (TARGET_AVX512VL)
4175 {
4176 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4177 cop0 = force_reg (mode, cop0);
4178 cop1 = force_reg (mode, cop1);
4179 }
4180 break;
4181 default:
4182 break;
4183 }
4184
4185 if (gen)
4186 {
4187 rtx tem = gen_reg_rtx (mode);
4188 if (!vector_operand (cop0, mode))
4189 cop0 = force_reg (mode, cop0);
4190 if (!vector_operand (cop1, mode))
4191 cop1 = force_reg (mode, cop1);
4192 *negate = !*negate;
4193 emit_insn (gen (tem, cop0, cop1));
4194 cop1 = tem;
4195 code = EQ;
4196 }
4197 }
4198
4199 /* Unsigned parallel compare is not supported by the hardware.
4200 Play some tricks to turn this into a signed comparison
4201 against 0. */
4202 if (code == GTU)
4203 {
4204 cop0 = force_reg (mode, cop0);
4205
4206 switch (mode)
4207 {
4208 case E_V16SImode:
4209 case E_V8DImode:
4210 case E_V8SImode:
4211 case E_V4DImode:
4212 case E_V4SImode:
4213 case E_V2DImode:
4214 {
4215 rtx t1, t2, mask;
4216 rtx (*gen_sub3) (rtx, rtx, rtx);
4217
4218 switch (mode)
4219 {
4220 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
4221 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
4222 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
4223 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
4224 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
4225 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
4226 default:
4227 gcc_unreachable ();
4228 }
4229 /* Subtract (-(INT MAX) - 1) from both operands to make
4230 them signed. */
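/* Illustrative example: for SImode elements MASK is 0x80000000 and
   a >u b is equivalent to (a - 0x80000000) >s (b - 0x80000000).
   E.g. a = 0xffffffff, b = 0 becomes 0x7fffffff >s 0x80000000,
   i.e. signed INT_MAX > INT_MIN, which is true, matching a >u b.  */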
4231 mask = ix86_build_signbit_mask (mode, true, false);
4232 t1 = gen_reg_rtx (mode);
4233 emit_insn (gen_sub3 (t1, cop0, mask));
4234
4235 t2 = gen_reg_rtx (mode);
4236 emit_insn (gen_sub3 (t2, cop1, mask));
4237
4238 cop0 = t1;
4239 cop1 = t2;
4240 code = GT;
4241 }
4242 break;
4243
4244 case E_V64QImode:
4245 case E_V32HImode:
4246 case E_V32QImode:
4247 case E_V16HImode:
4248 case E_V16QImode:
4249 case E_V8HImode:
4250 /* Perform a parallel unsigned saturating subtraction. */
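/* E.g. for unsigned QImode elements a >u b iff (a -us b) != 0:
   a = 5, b = 7 gives 5 -us 7 = 0 (not greater), while
   a = 9, b = 7 gives 9 -us 7 = 2 != 0 (greater).  The code below
   therefore tests (a -us b) == 0 and flips *NEGATE.  */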
4251 x = gen_reg_rtx (mode);
4252 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
4253 cop1)));
4254
4255 cop0 = x;
4256 cop1 = CONST0_RTX (mode);
4257 code = EQ;
4258 *negate = !*negate;
4259 break;
4260
4261 default:
4262 gcc_unreachable ();
4263 }
4264 }
4265 }
4266
4267 if (*negate)
4268 std::swap (op_true, op_false);
4269
4270 /* Allow the comparison to be done in one mode, but the movcc to
4271 happen in another mode. */
4272 if (data_mode == mode)
4273 {
4274 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4275 op_true, op_false);
4276 }
4277 else
4278 {
4279 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4280 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4281 op_true, op_false);
4282 if (GET_MODE (x) == mode)
4283 x = gen_lowpart (data_mode, x);
4284 }
4285
4286 return x;
4287 }
4288
4289 /* Expand integer vector comparison. */
4290
4291 bool
4292 ix86_expand_int_vec_cmp (rtx operands[])
4293 {
4294 rtx_code code = GET_CODE (operands[1]);
4295 bool negate = false;
4296 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4297 operands[3], NULL, NULL, &negate);
4298
4299 if (!cmp)
4300 return false;
4301
4302 if (negate)
4303 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4304 CONST0_RTX (GET_MODE (cmp)),
4305 NULL, NULL, &negate);
4306
4307 gcc_assert (!negate);
4308
4309 if (operands[0] != cmp)
4310 emit_move_insn (operands[0], cmp);
4311
4312 return true;
4313 }
4314
4315 /* Expand a floating-point vector conditional move; a vcond operation
4316 rather than a movcc operation. */
4317
4318 bool
4319 ix86_expand_fp_vcond (rtx operands[])
4320 {
4321 enum rtx_code code = GET_CODE (operands[3]);
4322 rtx cmp;
4323
4324 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4325 &operands[4], &operands[5]);
4326 if (code == UNKNOWN)
4327 {
4328 rtx temp;
4329 switch (GET_CODE (operands[3]))
4330 {
4331 case LTGT:
4332 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4333 operands[5], operands[0], operands[0]);
4334 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4335 operands[5], operands[1], operands[2]);
4336 code = AND;
4337 break;
4338 case UNEQ:
4339 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4340 operands[5], operands[0], operands[0]);
4341 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4342 operands[5], operands[1], operands[2]);
4343 code = IOR;
4344 break;
4345 default:
4346 gcc_unreachable ();
4347 }
4348 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4349 OPTAB_DIRECT);
4350 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4351 return true;
4352 }
4353
4354 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4355 operands[5], operands[1], operands[2]))
4356 return true;
4357
4358 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4359 operands[1], operands[2]);
4360 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4361 return true;
4362 }
4363
4364 /* Expand a signed/unsigned integral vector conditional move. */
4365
4366 bool
4367 ix86_expand_int_vcond (rtx operands[])
4368 {
4369 machine_mode data_mode = GET_MODE (operands[0]);
4370 machine_mode mode = GET_MODE (operands[4]);
4371 enum rtx_code code = GET_CODE (operands[3]);
4372 bool negate = false;
4373 rtx x, cop0, cop1;
4374
4375 cop0 = operands[4];
4376 cop1 = operands[5];
4377
4378 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4379 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
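/* For example, with 32-bit elements and x = 0x80000001:
   (signed) x >> 31 = 0xffffffff = -1 and (unsigned) x >> 31 = 1,
   which are exactly the x < 0 ? -1 : 0 and x < 0 ? 1 : 0 results.  */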
4380 if ((code == LT || code == GE)
4381 && data_mode == mode
4382 && cop1 == CONST0_RTX (mode)
4383 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4384 && GET_MODE_UNIT_SIZE (data_mode) > 1
4385 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4386 && (GET_MODE_SIZE (data_mode) == 16
4387 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4388 {
4389 rtx negop = operands[2 - (code == LT)];
4390 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4391 if (negop == CONST1_RTX (data_mode))
4392 {
4393 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4394 operands[0], 1, OPTAB_DIRECT);
4395 if (res != operands[0])
4396 emit_move_insn (operands[0], res);
4397 return true;
4398 }
4399 else if (GET_MODE_INNER (data_mode) != DImode
4400 && vector_all_ones_operand (negop, data_mode))
4401 {
4402 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4403 operands[0], 0, OPTAB_DIRECT);
4404 if (res != operands[0])
4405 emit_move_insn (operands[0], res);
4406 return true;
4407 }
4408 }
4409
4410 if (!nonimmediate_operand (cop1, mode))
4411 cop1 = force_reg (mode, cop1);
4412 if (!general_operand (operands[1], data_mode))
4413 operands[1] = force_reg (data_mode, operands[1]);
4414 if (!general_operand (operands[2], data_mode))
4415 operands[2] = force_reg (data_mode, operands[2]);
4416
4417 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4418 operands[1], operands[2], &negate);
4419
4420 if (!x)
4421 return false;
4422
4423 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4424 operands[2-negate]);
4425 return true;
4426 }
4427
4428 static bool
4429 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4430 struct expand_vec_perm_d *d)
4431 {
4432 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4433 expanders, so args are either in d, or in op0, op1 etc. */
4434 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4435 machine_mode maskmode = mode;
4436 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4437
4438 switch (mode)
4439 {
4440 case E_V8HImode:
4441 if (TARGET_AVX512VL && TARGET_AVX512BW)
4442 gen = gen_avx512vl_vpermt2varv8hi3;
4443 break;
4444 case E_V16HImode:
4445 if (TARGET_AVX512VL && TARGET_AVX512BW)
4446 gen = gen_avx512vl_vpermt2varv16hi3;
4447 break;
4448 case E_V64QImode:
4449 if (TARGET_AVX512VBMI)
4450 gen = gen_avx512bw_vpermt2varv64qi3;
4451 break;
4452 case E_V32HImode:
4453 if (TARGET_AVX512BW)
4454 gen = gen_avx512bw_vpermt2varv32hi3;
4455 break;
4456 case E_V4SImode:
4457 if (TARGET_AVX512VL)
4458 gen = gen_avx512vl_vpermt2varv4si3;
4459 break;
4460 case E_V8SImode:
4461 if (TARGET_AVX512VL)
4462 gen = gen_avx512vl_vpermt2varv8si3;
4463 break;
4464 case E_V16SImode:
4465 if (TARGET_AVX512F)
4466 gen = gen_avx512f_vpermt2varv16si3;
4467 break;
4468 case E_V4SFmode:
4469 if (TARGET_AVX512VL)
4470 {
4471 gen = gen_avx512vl_vpermt2varv4sf3;
4472 maskmode = V4SImode;
4473 }
4474 break;
4475 case E_V8SFmode:
4476 if (TARGET_AVX512VL)
4477 {
4478 gen = gen_avx512vl_vpermt2varv8sf3;
4479 maskmode = V8SImode;
4480 }
4481 break;
4482 case E_V16SFmode:
4483 if (TARGET_AVX512F)
4484 {
4485 gen = gen_avx512f_vpermt2varv16sf3;
4486 maskmode = V16SImode;
4487 }
4488 break;
4489 case E_V2DImode:
4490 if (TARGET_AVX512VL)
4491 gen = gen_avx512vl_vpermt2varv2di3;
4492 break;
4493 case E_V4DImode:
4494 if (TARGET_AVX512VL)
4495 gen = gen_avx512vl_vpermt2varv4di3;
4496 break;
4497 case E_V8DImode:
4498 if (TARGET_AVX512F)
4499 gen = gen_avx512f_vpermt2varv8di3;
4500 break;
4501 case E_V2DFmode:
4502 if (TARGET_AVX512VL)
4503 {
4504 gen = gen_avx512vl_vpermt2varv2df3;
4505 maskmode = V2DImode;
4506 }
4507 break;
4508 case E_V4DFmode:
4509 if (TARGET_AVX512VL)
4510 {
4511 gen = gen_avx512vl_vpermt2varv4df3;
4512 maskmode = V4DImode;
4513 }
4514 break;
4515 case E_V8DFmode:
4516 if (TARGET_AVX512F)
4517 {
4518 gen = gen_avx512f_vpermt2varv8df3;
4519 maskmode = V8DImode;
4520 }
4521 break;
4522 default:
4523 break;
4524 }
4525
4526 if (gen == NULL)
4527 return false;
4528
4529 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4530 expanders, so args are either in d, or in op0, op1 etc. */
4531 if (d)
4532 {
4533 rtx vec[64];
4534 target = d->target;
4535 op0 = d->op0;
4536 op1 = d->op1;
4537 for (int i = 0; i < d->nelt; ++i)
4538 vec[i] = GEN_INT (d->perm[i]);
4539 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4540 }
4541
4542 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4543 return true;
4544 }
4545
4546 /* Expand a variable vector permutation. */
4547
4548 void
4549 ix86_expand_vec_perm (rtx operands[])
4550 {
4551 rtx target = operands[0];
4552 rtx op0 = operands[1];
4553 rtx op1 = operands[2];
4554 rtx mask = operands[3];
4555 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4556 machine_mode mode = GET_MODE (op0);
4557 machine_mode maskmode = GET_MODE (mask);
4558 int w, e, i;
4559 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4560
4561 /* Number of elements in the vector. */
4562 w = GET_MODE_NUNITS (mode);
4563 e = GET_MODE_UNIT_SIZE (mode);
4564 gcc_assert (w <= 64);
4565
4566 if (TARGET_AVX512F && one_operand_shuffle)
4567 {
4568 rtx (*gen) (rtx, rtx, rtx) = NULL;
4569 switch (mode)
4570 {
4571 case E_V16SImode:
4572 gen = gen_avx512f_permvarv16si;
4573 break;
4574 case E_V16SFmode:
4575 gen = gen_avx512f_permvarv16sf;
4576 break;
4577 case E_V8DImode:
4578 gen = gen_avx512f_permvarv8di;
4579 break;
4580 case E_V8DFmode:
4581 gen = gen_avx512f_permvarv8df;
4582 break;
4583 default:
4584 break;
4585 }
4586 if (gen != NULL)
4587 {
4588 emit_insn (gen (target, op0, mask));
4589 return;
4590 }
4591 }
4592
4593 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4594 return;
4595
4596 if (TARGET_AVX2)
4597 {
4598 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4599 {
4600 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4601 a constant shuffle operand. With a tiny bit of effort we can
4602 use VPERMD instead. A re-interpretation stall for V4DFmode is
4603 unfortunate but there's no avoiding it.
4604 Similarly for V16HImode we don't have instructions for variable
4605 shuffling, while for V32QImode we can, after preparing suitable
4606 masks, use vpshufb; vpshufb; vpermq; vpor. */
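/* Illustrative example: a V4DImode mask { 1, 3, 0, 2 } is first
   replicated to { 1, 1, 3, 3, 0, 0, 2, 2 }, doubled to
   { 2, 2, 6, 6, 0, 0, 4, 4 }, and then { 0, 1, 0, 1, ... } is added,
   giving the V8SImode control { 2, 3, 6, 7, 0, 1, 4, 5 } that selects
   the same qwords as the original mask, one dword pair at a time.  */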
4607
4608 if (mode == V16HImode)
4609 {
4610 maskmode = mode = V32QImode;
4611 w = 32;
4612 e = 1;
4613 }
4614 else
4615 {
4616 maskmode = mode = V8SImode;
4617 w = 8;
4618 e = 4;
4619 }
4620 t1 = gen_reg_rtx (maskmode);
4621
4622 /* Replicate the low bits of the V4DImode mask into V8SImode:
4623 mask = { A B C D }
4624 t1 = { A A B B C C D D }. */
4625 for (i = 0; i < w / 2; ++i)
4626 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4627 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4628 vt = force_reg (maskmode, vt);
4629 mask = gen_lowpart (maskmode, mask);
4630 if (maskmode == V8SImode)
4631 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4632 else
4633 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4634
4635 /* Multiply the shuffle indices by two. */
4636 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4637 OPTAB_DIRECT);
4638
4639 /* Add one to the odd shuffle indices:
4640 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4641 for (i = 0; i < w / 2; ++i)
4642 {
4643 vec[i * 2] = const0_rtx;
4644 vec[i * 2 + 1] = const1_rtx;
4645 }
4646 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4647 vt = validize_mem (force_const_mem (maskmode, vt));
4648 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4649 OPTAB_DIRECT);
4650
4651 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4652 operands[3] = mask = t1;
4653 target = gen_reg_rtx (mode);
4654 op0 = gen_lowpart (mode, op0);
4655 op1 = gen_lowpart (mode, op1);
4656 }
4657
4658 switch (mode)
4659 {
4660 case E_V8SImode:
4661 /* The VPERMD and VPERMPS instructions already properly ignore
4662 the high bits of the shuffle elements. No need for us to
4663 perform an AND ourselves. */
4664 if (one_operand_shuffle)
4665 {
4666 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4667 if (target != operands[0])
4668 emit_move_insn (operands[0],
4669 gen_lowpart (GET_MODE (operands[0]), target));
4670 }
4671 else
4672 {
4673 t1 = gen_reg_rtx (V8SImode);
4674 t2 = gen_reg_rtx (V8SImode);
4675 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4676 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4677 goto merge_two;
4678 }
4679 return;
4680
4681 case E_V8SFmode:
4682 mask = gen_lowpart (V8SImode, mask);
4683 if (one_operand_shuffle)
4684 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4685 else
4686 {
4687 t1 = gen_reg_rtx (V8SFmode);
4688 t2 = gen_reg_rtx (V8SFmode);
4689 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4690 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4691 goto merge_two;
4692 }
4693 return;
4694
4695 case E_V4SImode:
4696 /* By combining the two 128-bit input vectors into one 256-bit
4697 input vector, we can use VPERMD and VPERMPS for the full
4698 two-operand shuffle. */
4699 t1 = gen_reg_rtx (V8SImode);
4700 t2 = gen_reg_rtx (V8SImode);
4701 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4702 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4703 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4704 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4705 return;
4706
4707 case E_V4SFmode:
4708 t1 = gen_reg_rtx (V8SFmode);
4709 t2 = gen_reg_rtx (V8SImode);
4710 mask = gen_lowpart (V4SImode, mask);
4711 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4712 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4713 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4714 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4715 return;
4716
4717 case E_V32QImode:
4718 t1 = gen_reg_rtx (V32QImode);
4719 t2 = gen_reg_rtx (V32QImode);
4720 t3 = gen_reg_rtx (V32QImode);
4721 vt2 = GEN_INT (-128);
4722 vt = gen_const_vec_duplicate (V32QImode, vt2);
4723 vt = force_reg (V32QImode, vt);
4724 for (i = 0; i < 32; i++)
4725 vec[i] = i < 16 ? vt2 : const0_rtx;
4726 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4727 vt2 = force_reg (V32QImode, vt2);
4728 /* From mask create two adjusted masks, which contain the same
4729 bits as mask in the low 7 bits of each vector element.
4730 The first mask will have the most significant bit clear
4731 if it requests element from the same 128-bit lane
4732 and MSB set if it requests element from the other 128-bit lane.
4733 The second mask will have the opposite values of the MSB,
4734 and additionally will have its 128-bit lanes swapped.
4735 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4736 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4737 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4738 stands for the other 12 bytes. */
4739 /* The bit that tells whether an element comes from the same lane or the
4740 other lane is bit 4, so shift it up by 3 to the MSB position. */
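/* E.g. byte index 0x17 (element 23, which lives in the high
   128-bit lane) has bit 4 set; 0x17 << 3 = 0xb8 has the MSB set,
   which is the bit vpshufb uses to zero out the result byte.  */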
4741 t5 = gen_reg_rtx (V4DImode);
4742 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4743 GEN_INT (3)));
4744 /* Clear MSB bits from the mask just in case it had them set. */
4745 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4746 /* After this t1 will have MSB set for elements from other lane. */
4747 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4748 /* Clear bits other than MSB. */
4749 emit_insn (gen_andv32qi3 (t1, t1, vt));
4750 /* Or in the lower bits from mask into t3. */
4751 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4752 /* And invert MSB bits in t1, so MSB is set for elements from the same
4753 lane. */
4754 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4755 /* Swap 128-bit lanes in t3. */
4756 t6 = gen_reg_rtx (V4DImode);
4757 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4758 const2_rtx, GEN_INT (3),
4759 const0_rtx, const1_rtx));
4760 /* And or in the lower bits from mask into t1. */
4761 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4762 if (one_operand_shuffle)
4763 {
4764 /* Each of these shuffles will put 0s in places where
4765 element from the other 128-bit lane is needed, otherwise
4766 will shuffle in the requested value. */
4767 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4768 gen_lowpart (V32QImode, t6)));
4769 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4770 /* For t3 the 128-bit lanes are swapped again. */
4771 t7 = gen_reg_rtx (V4DImode);
4772 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4773 const2_rtx, GEN_INT (3),
4774 const0_rtx, const1_rtx));
4775 /* And ORing both together leads to the result. */
4776 emit_insn (gen_iorv32qi3 (target, t1,
4777 gen_lowpart (V32QImode, t7)));
4778 if (target != operands[0])
4779 emit_move_insn (operands[0],
4780 gen_lowpart (GET_MODE (operands[0]), target));
4781 return;
4782 }
4783
4784 t4 = gen_reg_rtx (V32QImode);
4785 /* Similar to the one_operand_shuffle code above, just
4786 repeated twice, once for each operand. The code at the
4787 merge_two: label will merge the two results together. */
4788 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4789 gen_lowpart (V32QImode, t6)));
4790 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4791 gen_lowpart (V32QImode, t6)));
4792 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4793 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4794 t7 = gen_reg_rtx (V4DImode);
4795 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4796 const2_rtx, GEN_INT (3),
4797 const0_rtx, const1_rtx));
4798 t8 = gen_reg_rtx (V4DImode);
4799 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4800 const2_rtx, GEN_INT (3),
4801 const0_rtx, const1_rtx));
4802 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4803 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4804 t1 = t4;
4805 t2 = t3;
4806 goto merge_two;
4807
4808 default:
4809 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4810 break;
4811 }
4812 }
4813
4814 if (TARGET_XOP)
4815 {
4816 /* The XOP VPPERM insn supports three inputs. By ignoring the
4817 one_operand_shuffle special case, we avoid creating another
4818 set of constant vectors in memory. */
4819 one_operand_shuffle = false;
4820
4821 /* mask = mask & {2*w-1, ...} */
4822 vt = GEN_INT (2*w - 1);
4823 }
4824 else
4825 {
4826 /* mask = mask & {w-1, ...} */
4827 vt = GEN_INT (w - 1);
4828 }
4829
4830 vt = gen_const_vec_duplicate (maskmode, vt);
4831 mask = expand_simple_binop (maskmode, AND, mask, vt,
4832 NULL_RTX, 0, OPTAB_DIRECT);
4833
4834 /* For non-QImode operations, convert the word permutation control
4835 into a byte permutation control. */
4836 if (mode != V16QImode)
4837 {
4838 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4839 GEN_INT (exact_log2 (e)),
4840 NULL_RTX, 0, OPTAB_DIRECT);
4841
4842 /* Convert mask to vector of chars. */
4843 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4844
4845 /* Replicate each of the input bytes into byte positions:
4846 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4847 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4848 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4849 for (i = 0; i < 16; ++i)
4850 vec[i] = GEN_INT (i/e * e);
4851 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4852 vt = validize_mem (force_const_mem (V16QImode, vt));
4853 if (TARGET_XOP)
4854 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4855 else
4856 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4857
4858 /* Convert it into the byte positions by doing
4859 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4860 for (i = 0; i < 16; ++i)
4861 vec[i] = GEN_INT (i % e);
4862 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4863 vt = validize_mem (force_const_mem (V16QImode, vt));
4864 emit_insn (gen_addv16qi3 (mask, mask, vt));
4865 }
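/* E.g. for V4SImode (e == 4), a word index of 3 becomes byte offset
   12 after the shift; replication and the final addition then yield
   the byte indices { 12, 13, 14, 15 } for that element, selecting all
   four bytes of word 3.  */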
4866
4867 /* The actual shuffle operations all operate on V16QImode. */
4868 op0 = gen_lowpart (V16QImode, op0);
4869 op1 = gen_lowpart (V16QImode, op1);
4870
4871 if (TARGET_XOP)
4872 {
4873 if (GET_MODE (target) != V16QImode)
4874 target = gen_reg_rtx (V16QImode);
4875 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4876 if (target != operands[0])
4877 emit_move_insn (operands[0],
4878 gen_lowpart (GET_MODE (operands[0]), target));
4879 }
4880 else if (one_operand_shuffle)
4881 {
4882 if (GET_MODE (target) != V16QImode)
4883 target = gen_reg_rtx (V16QImode);
4884 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4885 if (target != operands[0])
4886 emit_move_insn (operands[0],
4887 gen_lowpart (GET_MODE (operands[0]), target));
4888 }
4889 else
4890 {
4891 rtx xops[6];
4892 bool ok;
4893
4894 /* Shuffle the two input vectors independently. */
4895 t1 = gen_reg_rtx (V16QImode);
4896 t2 = gen_reg_rtx (V16QImode);
4897 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4898 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4899
4900 merge_two:
4901 /* Then merge them together. The key is whether any given control
4902 element contained a bit set that indicates the second word. */
4903 mask = operands[3];
4904 vt = GEN_INT (w);
4905 if (maskmode == V2DImode && !TARGET_SSE4_1)
4906 {
4907 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4908 more shuffle to convert the V2DI input mask into a V4SI
4909 input mask. At that point the masking performed by
4910 ix86_expand_int_vcond will work as desired. */
4911 rtx t3 = gen_reg_rtx (V4SImode);
4912 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4913 const0_rtx, const0_rtx,
4914 const2_rtx, const2_rtx));
4915 mask = t3;
4916 maskmode = V4SImode;
4917 e = w = 4;
4918 }
4919
4920 vt = gen_const_vec_duplicate (maskmode, vt);
4921 vt = force_reg (maskmode, vt);
4922 mask = expand_simple_binop (maskmode, AND, mask, vt,
4923 NULL_RTX, 0, OPTAB_DIRECT);
4924
4925 if (GET_MODE (target) != mode)
4926 target = gen_reg_rtx (mode);
4927 xops[0] = target;
4928 xops[1] = gen_lowpart (mode, t2);
4929 xops[2] = gen_lowpart (mode, t1);
4930 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4931 xops[4] = mask;
4932 xops[5] = vt;
4933 ok = ix86_expand_int_vcond (xops);
4934 gcc_assert (ok);
4935 if (target != operands[0])
4936 emit_move_insn (operands[0],
4937 gen_lowpart (GET_MODE (operands[0]), target));
4938 }
4939 }
4940
4941 /* Unpack SRC into DEST, which has the next wider integer vector type. UNSIGNED_P is
4942 true if we should do zero extension, else sign extension. HIGH_P is
4943 true if we want the N/2 high elements, else the low elements. */
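/* For example, a V16QImode SRC with UNSIGNED_P and HIGH_P set is
   expanded (on SSE4.1) by shifting the upper 8 bytes down and
   zero-extending them with pmovzxbw, producing a V8HImode DEST
   holding elements 8..15 of SRC.  */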
4944
4945 void
4946 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4947 {
4948 machine_mode imode = GET_MODE (src);
4949 rtx tmp;
4950
4951 if (TARGET_SSE4_1)
4952 {
4953 rtx (*unpack)(rtx, rtx);
4954 rtx (*extract)(rtx, rtx) = NULL;
4955 machine_mode halfmode = BLKmode;
4956
4957 switch (imode)
4958 {
4959 case E_V64QImode:
4960 if (unsigned_p)
4961 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4962 else
4963 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4964 halfmode = V32QImode;
4965 extract
4966 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4967 break;
4968 case E_V32QImode:
4969 if (unsigned_p)
4970 unpack = gen_avx2_zero_extendv16qiv16hi2;
4971 else
4972 unpack = gen_avx2_sign_extendv16qiv16hi2;
4973 halfmode = V16QImode;
4974 extract
4975 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4976 break;
4977 case E_V32HImode:
4978 if (unsigned_p)
4979 unpack = gen_avx512f_zero_extendv16hiv16si2;
4980 else
4981 unpack = gen_avx512f_sign_extendv16hiv16si2;
4982 halfmode = V16HImode;
4983 extract
4984 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4985 break;
4986 case E_V16HImode:
4987 if (unsigned_p)
4988 unpack = gen_avx2_zero_extendv8hiv8si2;
4989 else
4990 unpack = gen_avx2_sign_extendv8hiv8si2;
4991 halfmode = V8HImode;
4992 extract
4993 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4994 break;
4995 case E_V16SImode:
4996 if (unsigned_p)
4997 unpack = gen_avx512f_zero_extendv8siv8di2;
4998 else
4999 unpack = gen_avx512f_sign_extendv8siv8di2;
5000 halfmode = V8SImode;
5001 extract
5002 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5003 break;
5004 case E_V8SImode:
5005 if (unsigned_p)
5006 unpack = gen_avx2_zero_extendv4siv4di2;
5007 else
5008 unpack = gen_avx2_sign_extendv4siv4di2;
5009 halfmode = V4SImode;
5010 extract
5011 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5012 break;
5013 case E_V16QImode:
5014 if (unsigned_p)
5015 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5016 else
5017 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5018 break;
5019 case E_V8HImode:
5020 if (unsigned_p)
5021 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5022 else
5023 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5024 break;
5025 case E_V4SImode:
5026 if (unsigned_p)
5027 unpack = gen_sse4_1_zero_extendv2siv2di2;
5028 else
5029 unpack = gen_sse4_1_sign_extendv2siv2di2;
5030 break;
5031 default:
5032 gcc_unreachable ();
5033 }
5034
5035 if (GET_MODE_SIZE (imode) >= 32)
5036 {
5037 tmp = gen_reg_rtx (halfmode);
5038 emit_insn (extract (tmp, src));
5039 }
5040 else if (high_p)
5041 {
5042 /* Shift higher 8 bytes to lower 8 bytes. */
5043 tmp = gen_reg_rtx (V1TImode);
5044 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5045 GEN_INT (64)));
5046 tmp = gen_lowpart (imode, tmp);
5047 }
5048 else
5049 tmp = src;
5050
5051 emit_insn (unpack (dest, tmp));
5052 }
5053 else
5054 {
5055 rtx (*unpack)(rtx, rtx, rtx);
5056
5057 switch (imode)
5058 {
5059 case E_V16QImode:
5060 if (high_p)
5061 unpack = gen_vec_interleave_highv16qi;
5062 else
5063 unpack = gen_vec_interleave_lowv16qi;
5064 break;
5065 case E_V8HImode:
5066 if (high_p)
5067 unpack = gen_vec_interleave_highv8hi;
5068 else
5069 unpack = gen_vec_interleave_lowv8hi;
5070 break;
5071 case E_V4SImode:
5072 if (high_p)
5073 unpack = gen_vec_interleave_highv4si;
5074 else
5075 unpack = gen_vec_interleave_lowv4si;
5076 break;
5077 default:
5078 gcc_unreachable ();
5079 }
5080
5081 if (unsigned_p)
5082 tmp = force_reg (imode, CONST0_RTX (imode));
5083 else
5084 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5085 src, pc_rtx, pc_rtx);
5086
5087 rtx tmp2 = gen_reg_rtx (imode);
5088 emit_insn (unpack (tmp2, src, tmp));
5089 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5090 }
5091 }
5092
5093 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5094 but works for floating-point parameters and non-offsettable memories.
5095 For pushes, it returns just stack offsets; the values will be saved
5096 in the right order. At most four parts are generated. */
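/* For example, on a 32-bit target a DFmode operand yields two SImode
   parts, XFmode three and TFmode four, while in 64-bit mode XFmode and
   TFmode are split into a DImode part plus an SImode or DImode upper
   part respectively.  */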
5097
5098 static int
5099 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5100 {
5101 int size;
5102
5103 if (!TARGET_64BIT)
5104 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5105 else
5106 size = (GET_MODE_SIZE (mode) + 4) / 8;
5107
5108 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5109 gcc_assert (size >= 2 && size <= 4);
5110
5111 /* Optimize constant pool reference to immediates. This is used by fp
5112 moves, which force all constants to memory to allow combining. */
5113 if (MEM_P (operand) && MEM_READONLY_P (operand))
5114 operand = avoid_constant_pool_reference (operand);
5115
5116 if (MEM_P (operand) && !offsettable_memref_p (operand))
5117 {
5118 /* The only non-offsettable memories we handle are pushes. */
5119 int ok = push_operand (operand, VOIDmode);
5120
5121 gcc_assert (ok);
5122
5123 operand = copy_rtx (operand);
5124 PUT_MODE (operand, word_mode);
5125 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5126 return size;
5127 }
5128
5129 if (GET_CODE (operand) == CONST_VECTOR)
5130 {
5131 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5132 /* Caution: if we looked through a constant pool memory above,
5133 the operand may actually have a different mode now. That's
5134 ok, since we want to pun this all the way back to an integer. */
5135 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5136 gcc_assert (operand != NULL);
5137 mode = imode;
5138 }
5139
5140 if (!TARGET_64BIT)
5141 {
5142 if (mode == DImode)
5143 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5144 else
5145 {
5146 int i;
5147
5148 if (REG_P (operand))
5149 {
5150 gcc_assert (reload_completed);
5151 for (i = 0; i < size; i++)
5152 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5153 }
5154 else if (offsettable_memref_p (operand))
5155 {
5156 operand = adjust_address (operand, SImode, 0);
5157 parts[0] = operand;
5158 for (i = 1; i < size; i++)
5159 parts[i] = adjust_address (operand, SImode, 4 * i);
5160 }
5161 else if (CONST_DOUBLE_P (operand))
5162 {
5163 const REAL_VALUE_TYPE *r;
5164 long l[4];
5165
5166 r = CONST_DOUBLE_REAL_VALUE (operand);
5167 switch (mode)
5168 {
5169 case E_TFmode:
5170 real_to_target (l, r, mode);
5171 parts[3] = gen_int_mode (l[3], SImode);
5172 parts[2] = gen_int_mode (l[2], SImode);
5173 break;
5174 case E_XFmode:
5175 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5176 long double may not be 80-bit. */
5177 real_to_target (l, r, mode);
5178 parts[2] = gen_int_mode (l[2], SImode);
5179 break;
5180 case E_DFmode:
5181 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5182 break;
5183 default:
5184 gcc_unreachable ();
5185 }
5186 parts[1] = gen_int_mode (l[1], SImode);
5187 parts[0] = gen_int_mode (l[0], SImode);
5188 }
5189 else
5190 gcc_unreachable ();
5191 }
5192 }
5193 else
5194 {
5195 if (mode == TImode)
5196 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5197 if (mode == XFmode || mode == TFmode)
5198 {
5199 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5200 if (REG_P (operand))
5201 {
5202 gcc_assert (reload_completed);
5203 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5204 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5205 }
5206 else if (offsettable_memref_p (operand))
5207 {
5208 operand = adjust_address (operand, DImode, 0);
5209 parts[0] = operand;
5210 parts[1] = adjust_address (operand, upper_mode, 8);
5211 }
5212 else if (CONST_DOUBLE_P (operand))
5213 {
5214 long l[4];
5215
5216 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5217
5218 /* real_to_target puts 32-bit pieces in each long. */
5219 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5220 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5221 << 32), DImode);
5222
5223 if (upper_mode == SImode)
5224 parts[1] = gen_int_mode (l[2], SImode);
5225 else
5226 parts[1]
5227 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5228 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5229 << 32), DImode);
5230 }
5231 else
5232 gcc_unreachable ();
5233 }
5234 }
5235
5236 return size;
5237 }
5238
5239 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5240 All required insns are emitted directly; nothing is returned.
5241 Operands 2-5 are filled with the destination parts and operands 6-9
5242 with the source parts, in the order in which they must be moved. */
5243
5244 void
5245 ix86_split_long_move (rtx operands[])
5246 {
5247 rtx part[2][4];
5248 int nparts, i, j;
5249 int push = 0;
5250 int collisions = 0;
5251 machine_mode mode = GET_MODE (operands[0]);
5252 bool collisionparts[4];
5253
5254 /* The DFmode expanders may ask us to move a double.
5255 For a 64-bit target this is a single move. By hiding that fact
5256 here we simplify the i386.md splitters. */
5257 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5258 {
5259 /* Optimize constant pool reference to immediates. This is used by
5260 fp moves, which force all constants to memory to allow combining. */
5261
5262 if (MEM_P (operands[1])
5263 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5264 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5265 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5266 if (push_operand (operands[0], VOIDmode))
5267 {
5268 operands[0] = copy_rtx (operands[0]);
5269 PUT_MODE (operands[0], word_mode);
5270 }
5271 else
5272 operands[0] = gen_lowpart (DImode, operands[0]);
5273 operands[1] = gen_lowpart (DImode, operands[1]);
5274 emit_move_insn (operands[0], operands[1]);
5275 return;
5276 }
5277
5278 /* The only non-offsettable memory we handle is push. */
5279 if (push_operand (operands[0], VOIDmode))
5280 push = 1;
5281 else
5282 gcc_assert (!MEM_P (operands[0])
5283 || offsettable_memref_p (operands[0]));
5284
5285 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5286 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5287
5288 /* When emitting a push, take care of source operands on the stack. */
5289 if (push && MEM_P (operands[1])
5290 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5291 {
5292 rtx src_base = XEXP (part[1][nparts - 1], 0);
5293
5294 /* Compensate for the stack decrement by 4. */
5295 if (!TARGET_64BIT && nparts == 3
5296 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5297 src_base = plus_constant (Pmode, src_base, 4);
5298
5299 /* src_base refers to the stack pointer and is
5300 automatically decreased by emitted push. */
5301 for (i = 0; i < nparts; i++)
5302 part[1][i] = change_address (part[1][i],
5303 GET_MODE (part[1][i]), src_base);
5304 }
5305
5306 /* We need to do copy in the right order in case an address register
5307 of the source overlaps the destination. */
5308 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5309 {
5310 rtx tmp;
5311
5312 for (i = 0; i < nparts; i++)
5313 {
5314 collisionparts[i]
5315 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5316 if (collisionparts[i])
5317 collisions++;
5318 }
5319
5320 /* Collision in the middle part can be handled by reordering. */
5321 if (collisions == 1 && nparts == 3 && collisionparts [1])
5322 {
5323 std::swap (part[0][1], part[0][2]);
5324 std::swap (part[1][1], part[1][2]);
5325 }
5326 else if (collisions == 1
5327 && nparts == 4
5328 && (collisionparts [1] || collisionparts [2]))
5329 {
5330 if (collisionparts [1])
5331 {
5332 std::swap (part[0][1], part[0][2]);
5333 std::swap (part[1][1], part[1][2]);
5334 }
5335 else
5336 {
5337 std::swap (part[0][2], part[0][3]);
5338 std::swap (part[1][2], part[1][3]);
5339 }
5340 }
5341
5342 /* If there are more collisions, we can't handle it by reordering.
5343 Do an lea to the last part and use only one colliding move. */
5344 else if (collisions > 1)
5345 {
5346 rtx base, addr;
5347
5348 collisions = 1;
5349
5350 base = part[0][nparts - 1];
5351
5352 /* Handle the case when the last part isn't valid for lea.
5353 Happens in 64-bit mode storing the 12-byte XFmode. */
5354 if (GET_MODE (base) != Pmode)
5355 base = gen_rtx_REG (Pmode, REGNO (base));
5356
5357 addr = XEXP (part[1][0], 0);
5358 if (TARGET_TLS_DIRECT_SEG_REFS)
5359 {
5360 struct ix86_address parts;
5361 int ok = ix86_decompose_address (addr, &parts);
5362 gcc_assert (ok);
5363 /* It is not valid to use %gs: or %fs: in lea. */
5364 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5365 }
5366 emit_insn (gen_rtx_SET (base, addr));
5367 part[1][0] = replace_equiv_address (part[1][0], base);
5368 for (i = 1; i < nparts; i++)
5369 {
5370 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5371 part[1][i] = replace_equiv_address (part[1][i], tmp);
5372 }
5373 }
5374 }
5375
5376 if (push)
5377 {
5378 if (!TARGET_64BIT)
5379 {
5380 if (nparts == 3)
5381 {
5382 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5383 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5384 emit_move_insn (part[0][2], part[1][2]);
5385 }
5386 else if (nparts == 4)
5387 {
5388 emit_move_insn (part[0][3], part[1][3]);
5389 emit_move_insn (part[0][2], part[1][2]);
5390 }
5391 }
5392 else
5393 {
5394 /* In 64-bit mode we don't have a 32-bit push available. In case this is
5395 a register, it is OK - we will just use the larger counterpart. We also
5396 retype memory - this comes from an attempt to avoid a REX prefix on
5397 moving the second half of a TFmode value. */
5398 if (GET_MODE (part[1][1]) == SImode)
5399 {
5400 switch (GET_CODE (part[1][1]))
5401 {
5402 case MEM:
5403 part[1][1] = adjust_address (part[1][1], DImode, 0);
5404 break;
5405
5406 case REG:
5407 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5408 break;
5409
5410 default:
5411 gcc_unreachable ();
5412 }
5413
5414 if (GET_MODE (part[1][0]) == SImode)
5415 part[1][0] = part[1][1];
5416 }
5417 }
5418 emit_move_insn (part[0][1], part[1][1]);
5419 emit_move_insn (part[0][0], part[1][0]);
5420 return;
5421 }
5422
5423 /* Choose correct order to not overwrite the source before it is copied. */
5424 if ((REG_P (part[0][0])
5425 && REG_P (part[1][1])
5426 && (REGNO (part[0][0]) == REGNO (part[1][1])
5427 || (nparts == 3
5428 && REGNO (part[0][0]) == REGNO (part[1][2]))
5429 || (nparts == 4
5430 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5431 || (collisions > 0
5432 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5433 {
5434 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5435 {
5436 operands[2 + i] = part[0][j];
5437 operands[6 + i] = part[1][j];
5438 }
5439 }
5440 else
5441 {
5442 for (i = 0; i < nparts; i++)
5443 {
5444 operands[2 + i] = part[0][i];
5445 operands[6 + i] = part[1][i];
5446 }
5447 }
5448
5449 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5450 if (optimize_insn_for_size_p ())
5451 {
5452 for (j = 0; j < nparts - 1; j++)
5453 if (CONST_INT_P (operands[6 + j])
5454 && operands[6 + j] != const0_rtx
5455 && REG_P (operands[2 + j]))
5456 for (i = j; i < nparts - 1; i++)
5457 if (CONST_INT_P (operands[7 + i])
5458 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5459 operands[7 + i] = operands[2 + j];
5460 }
5461
5462 for (i = 0; i < nparts; i++)
5463 emit_move_insn (operands[2 + i], operands[6 + i]);
5464
5465 return;
5466 }
5467
5468 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5469 left shift by a constant, either using a single shift or
5470 a sequence of add instructions. */
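/* E.g. a shift by 2 on a target where two adds are cheaper than a
   constant shift is emitted as two "add reg, reg" instructions (each
   doubling is a shift left by one); otherwise a single shift by the
   constant is used.  */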
5471
5472 static void
5473 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5474 {
5475 rtx (*insn)(rtx, rtx, rtx);
5476
5477 if (count == 1
5478 || (count * ix86_cost->add <= ix86_cost->shift_const
5479 && !optimize_insn_for_size_p ()))
5480 {
5481 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
5482 while (count-- > 0)
5483 emit_insn (insn (operand, operand, operand));
5484 }
5485 else
5486 {
5487 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5488 emit_insn (insn (operand, operand, GEN_INT (count)));
5489 }
5490 }
5491
5492 void
5493 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5494 {
5495 rtx (*gen_ashl3)(rtx, rtx, rtx);
5496 rtx (*gen_shld)(rtx, rtx, rtx);
5497 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5498
5499 rtx low[2], high[2];
5500 int count;
5501
5502 if (CONST_INT_P (operands[2]))
5503 {
5504 split_double_mode (mode, operands, 2, low, high);
5505 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5506
5507 if (count >= half_width)
5508 {
5509 emit_move_insn (high[0], low[1]);
5510 emit_move_insn (low[0], const0_rtx);
5511
5512 if (count > half_width)
5513 ix86_expand_ashl_const (high[0], count - half_width, mode);
5514 }
5515 else
5516 {
5517 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5518
5519 if (!rtx_equal_p (operands[0], operands[1]))
5520 emit_move_insn (operands[0], operands[1]);
5521
5522 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5523 ix86_expand_ashl_const (low[0], count, mode);
5524 }
5525 return;
5526 }
5527
5528 split_double_mode (mode, operands, 1, low, high);
5529
5530 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5531
5532 if (operands[1] == const1_rtx)
5533 {
5534 /* Assuming we've chosen QImode-capable registers, 1 << N
5535 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5536 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5537 {
5538 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5539
5540 ix86_expand_clear (low[0]);
5541 ix86_expand_clear (high[0]);
5542 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5543
5544 d = gen_lowpart (QImode, low[0]);
5545 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5546 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5547 emit_insn (gen_rtx_SET (d, s));
5548
5549 d = gen_lowpart (QImode, high[0]);
5550 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5551 s = gen_rtx_NE (QImode, flags, const0_rtx);
5552 emit_insn (gen_rtx_SET (d, s));
5553 }
5554
5555 /* Otherwise, we can get the same results by manually performing
5556 a bit extract operation on bit 5/6, and then performing the two
5557 shifts. The two methods of getting 0/1 into low/high are exactly
5558 the same size. Avoiding the shift in the bit extract case helps
5559 pentium4 a bit; no one else seems to care much either way. */
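/* Illustrative example for the DImode case on a 32-bit target:
   HIGH = (COUNT >> 5) & 1 and LOW = HIGH ^ 1 select which half
   receives the bit; both halves are then shifted by COUNT, and since
   32-bit shifts only use the low 5 bits of the count, the selected
   half ends up with 1 << (COUNT & 31) and the other half with 0.  */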
5560 else
5561 {
5562 machine_mode half_mode;
5563 rtx (*gen_lshr3)(rtx, rtx, rtx);
5564 rtx (*gen_and3)(rtx, rtx, rtx);
5565 rtx (*gen_xor3)(rtx, rtx, rtx);
5566 HOST_WIDE_INT bits;
5567 rtx x;
5568
5569 if (mode == DImode)
5570 {
5571 half_mode = SImode;
5572 gen_lshr3 = gen_lshrsi3;
5573 gen_and3 = gen_andsi3;
5574 gen_xor3 = gen_xorsi3;
5575 bits = 5;
5576 }
5577 else
5578 {
5579 half_mode = DImode;
5580 gen_lshr3 = gen_lshrdi3;
5581 gen_and3 = gen_anddi3;
5582 gen_xor3 = gen_xordi3;
5583 bits = 6;
5584 }
5585
5586 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5587 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5588 else
5589 x = gen_lowpart (half_mode, operands[2]);
5590 emit_insn (gen_rtx_SET (high[0], x));
5591
5592 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5593 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5594 emit_move_insn (low[0], high[0]);
5595 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5596 }
5597
5598 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5599 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5600 return;
5601 }
5602
5603 if (operands[1] == constm1_rtx)
5604 {
5605 /* For -1 << N, we can avoid the shld instruction, because we
5606 know that we're shifting 0...31/63 ones into a -1. */
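/* E.g. for a 32-bit DImode shift with 0 <= N < 32, the result is
   always low = -1 << N, high = -1 (the shld would only shift more
   ones into an all-ones high part); counts of 32 or more are fixed
   up by the adjustment sequence below.  */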
5607 emit_move_insn (low[0], constm1_rtx);
5608 if (optimize_insn_for_size_p ())
5609 emit_move_insn (high[0], low[0]);
5610 else
5611 emit_move_insn (high[0], constm1_rtx);
5612 }
5613 else
5614 {
5615 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5616
5617 if (!rtx_equal_p (operands[0], operands[1]))
5618 emit_move_insn (operands[0], operands[1]);
5619
5620 split_double_mode (mode, operands, 1, low, high);
5621 emit_insn (gen_shld (high[0], low[0], operands[2]));
5622 }
5623
5624 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5625
5626 if (TARGET_CMOVE && scratch)
5627 {
5628 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
5629 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
5630
5631 ix86_expand_clear (scratch);
5632 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
5633 }
5634 else
5635 {
5636 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
5637 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
5638
5639 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
5640 }
5641 }
5642
5643 void
5644 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5645 {
5646 rtx (*gen_ashr3)(rtx, rtx, rtx)
5647 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5648 rtx (*gen_shrd)(rtx, rtx, rtx);
5649 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5650
5651 rtx low[2], high[2];
5652 int count;
5653
5654 if (CONST_INT_P (operands[2]))
5655 {
5656 split_double_mode (mode, operands, 2, low, high);
5657 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5658
5659 if (count == GET_MODE_BITSIZE (mode) - 1)
5660 {
5661 emit_move_insn (high[0], high[1]);
5662 emit_insn (gen_ashr3 (high[0], high[0],
5663 GEN_INT (half_width - 1)));
5664 emit_move_insn (low[0], high[0]);
5665
5666 }
5667 else if (count >= half_width)
5668 {
5669 emit_move_insn (low[0], high[1]);
5670 emit_move_insn (high[0], low[0]);
5671 emit_insn (gen_ashr3 (high[0], high[0],
5672 GEN_INT (half_width - 1)));
5673
5674 if (count > half_width)
5675 emit_insn (gen_ashr3 (low[0], low[0],
5676 GEN_INT (count - half_width)));
5677 }
5678 else
5679 {
5680 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5681
5682 if (!rtx_equal_p (operands[0], operands[1]))
5683 emit_move_insn (operands[0], operands[1]);
5684
5685 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5686 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5687 }
5688 }
5689 else
5690 {
5691 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5692
5693 if (!rtx_equal_p (operands[0], operands[1]))
5694 emit_move_insn (operands[0], operands[1]);
5695
5696 split_double_mode (mode, operands, 1, low, high);
5697
5698 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5699 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5700
5701 if (TARGET_CMOVE && scratch)
5702 {
5703 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
5704 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
5705
5706 emit_move_insn (scratch, high[0]);
5707 emit_insn (gen_ashr3 (scratch, scratch,
5708 GEN_INT (half_width - 1)));
5709 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
5710 scratch));
5711 }
5712 else
5713 {
5714 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
5715 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
5716
5717 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
5718 }
5719 }
5720 }
5721
5722 void
5723 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5724 {
5725 rtx (*gen_lshr3)(rtx, rtx, rtx)
5726 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5727 rtx (*gen_shrd)(rtx, rtx, rtx);
5728 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5729
5730 rtx low[2], high[2];
5731 int count;
5732
5733 if (CONST_INT_P (operands[2]))
5734 {
5735 split_double_mode (mode, operands, 2, low, high);
5736 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5737
5738 if (count >= half_width)
5739 {
5740 emit_move_insn (low[0], high[1]);
5741 ix86_expand_clear (high[0]);
5742
5743 if (count > half_width)
5744 emit_insn (gen_lshr3 (low[0], low[0],
5745 GEN_INT (count - half_width)));
5746 }
5747 else
5748 {
5749 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5750
5751 if (!rtx_equal_p (operands[0], operands[1]))
5752 emit_move_insn (operands[0], operands[1]);
5753
5754 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5755 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5756 }
5757 }
5758 else
5759 {
5760 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5761
5762 if (!rtx_equal_p (operands[0], operands[1]))
5763 emit_move_insn (operands[0], operands[1]);
5764
5765 split_double_mode (mode, operands, 1, low, high);
5766
5767 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5768 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5769
5770 if (TARGET_CMOVE && scratch)
5771 {
5772 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
5773 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
5774
5775 ix86_expand_clear (scratch);
5776 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
5777 scratch));
5778 }
5779 else
5780 {
5781 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
5782 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
5783
5784 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
5785 }
5786 }
5787 }
5788
5789 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5790 DImode for constant loop counts. */
5791
5792 static machine_mode
5793 counter_mode (rtx count_exp)
5794 {
5795 if (GET_MODE (count_exp) != VOIDmode)
5796 return GET_MODE (count_exp);
5797 if (!CONST_INT_P (count_exp))
5798 return Pmode;
5799 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5800 return DImode;
5801 return SImode;
5802 }
5803
5804 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
5805 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
5806 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
5807 loop to set memory with VALUE (supposed to be in MODE).
5808
5809 The size is rounded down to a whole number of chunks moved at once.
5810 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
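/* The generated code has roughly this shape (illustrative pseudo code):

     size = count & -(piece_size * unroll);
     iter = 0;
   top:
     copy (or store VALUE into) piece_size * unroll bytes
       at destptr + iter (and srcptr + iter);
     iter += piece_size * unroll;
     if (iter < size) goto top;
     destptr += iter;  srcptr += iter;  */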
5811
5812
5813 static void
5814 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
5815 rtx destptr, rtx srcptr, rtx value,
5816 rtx count, machine_mode mode, int unroll,
5817 int expected_size, bool issetmem)
5818 {
5819 rtx_code_label *out_label, *top_label;
5820 rtx iter, tmp;
5821 machine_mode iter_mode = counter_mode (count);
5822 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5823 rtx piece_size = GEN_INT (piece_size_n);
5824 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5825 rtx size;
5826 int i;
5827
5828 top_label = gen_label_rtx ();
5829 out_label = gen_label_rtx ();
5830 iter = gen_reg_rtx (iter_mode);
5831
5832 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5833 NULL, 1, OPTAB_DIRECT);
5834 /* Those two should combine. */
5835 if (piece_size == const1_rtx)
5836 {
5837 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5838 true, out_label);
5839 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5840 }
5841 emit_move_insn (iter, const0_rtx);
5842
5843 emit_label (top_label);
5844
5845 tmp = convert_modes (Pmode, iter_mode, iter, true);
5846
5847 /* This assert could be relaxed - in that case we'll need to compute
5848 the smallest power of two containing PIECE_SIZE_N and pass it to
5849 offset_address. */
5850 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5851 destmem = offset_address (destmem, tmp, piece_size_n);
5852 destmem = adjust_address (destmem, mode, 0);
5853
5854 if (!issetmem)
5855 {
5856 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5857 srcmem = adjust_address (srcmem, mode, 0);
5858
5859 /* When unrolling for chips that reorder memory reads and writes,
5860 we can save registers by using a single temporary.
5861 Also, using 4 temporaries is overkill in 32-bit mode. */
5862 if (!TARGET_64BIT && 0)
5863 {
5864 for (i = 0; i < unroll; i++)
5865 {
5866 if (i)
5867 {
5868 destmem = adjust_address (copy_rtx (destmem), mode,
5869 GET_MODE_SIZE (mode));
5870 srcmem = adjust_address (copy_rtx (srcmem), mode,
5871 GET_MODE_SIZE (mode));
5872 }
5873 emit_move_insn (destmem, srcmem);
5874 }
5875 }
5876 else
5877 {
5878 rtx tmpreg[4];
5879 gcc_assert (unroll <= 4);
5880 for (i = 0; i < unroll; i++)
5881 {
5882 tmpreg[i] = gen_reg_rtx (mode);
5883 if (i)
5884 srcmem = adjust_address (copy_rtx (srcmem), mode,
5885 GET_MODE_SIZE (mode));
5886 emit_move_insn (tmpreg[i], srcmem);
5887 }
5888 for (i = 0; i < unroll; i++)
5889 {
5890 if (i)
5891 destmem = adjust_address (copy_rtx (destmem), mode,
5892 GET_MODE_SIZE (mode));
5893 emit_move_insn (destmem, tmpreg[i]);
5894 }
5895 }
5896 }
5897 else
5898 for (i = 0; i < unroll; i++)
5899 {
5900 if (i)
5901 destmem = adjust_address (copy_rtx (destmem), mode,
5902 GET_MODE_SIZE (mode));
5903 emit_move_insn (destmem, value);
5904 }
5905
5906 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5907 true, OPTAB_LIB_WIDEN);
5908 if (tmp != iter)
5909 emit_move_insn (iter, tmp);
5910
5911 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5912 true, top_label);
5913 if (expected_size != -1)
5914 {
5915 expected_size /= GET_MODE_SIZE (mode) * unroll;
5916 if (expected_size == 0)
5917 predict_jump (0);
5918 else if (expected_size > REG_BR_PROB_BASE)
5919 predict_jump (REG_BR_PROB_BASE - 1);
5920 else
5921 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5922 / expected_size);
5923 }
5924 else
5925 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5926 iter = ix86_zero_extend_to_Pmode (iter);
5927 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5928 true, OPTAB_LIB_WIDEN);
5929 if (tmp != destptr)
5930 emit_move_insn (destptr, tmp);
5931 if (!issetmem)
5932 {
5933 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5934 true, OPTAB_LIB_WIDEN);
5935 if (tmp != srcptr)
5936 emit_move_insn (srcptr, tmp);
5937 }
5938 emit_label (out_label);
5939 }
5940
5941 /* Divide COUNTREG by SCALE. */
5942 static rtx
5943 scale_counter (rtx countreg, int scale)
5944 {
5945 rtx sc;
5946
5947 if (scale == 1)
5948 return countreg;
5949 if (CONST_INT_P (countreg))
5950 return GEN_INT (INTVAL (countreg) / scale);
5951 gcc_assert (REG_P (countreg));
5952
5953 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5954 GEN_INT (exact_log2 (scale)),
5955 NULL, 1, OPTAB_DIRECT);
5956 return sc;
5957 }
5958
5959 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5960 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5961 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5962 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5963 ORIG_VALUE is the original value passed to memset to fill the memory with.
5964 Other arguments have the same meaning as for the previous function. */
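/* For example, a copy of a constant COUNT of 32 bytes done in SImode
   chunks uses COUNTREG = 8 and DESTEXP = DESTPTR + (COUNTREG << 2),
   which the "rep; mov" pattern uses to describe the final value of the
   destination pointer.  */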
5965
5966 static void
5967 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
5968 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5969 rtx count,
5970 machine_mode mode, bool issetmem)
5971 {
5972 rtx destexp;
5973 rtx srcexp;
5974 rtx countreg;
5975 HOST_WIDE_INT rounded_count;
5976
5977 /* If possible, it is shorter to use rep movs.
5978 TODO: Maybe it is better to move this logic to decide_alg. */
5979 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5980 && (!issetmem || orig_value == const0_rtx))
5981 mode = SImode;
5982
5983 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5984 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5985
5986 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5987 GET_MODE_SIZE (mode)));
5988 if (mode != QImode)
5989 {
5990 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5991 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5992 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5993 }
5994 else
5995 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5996 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5997 {
5998 rounded_count
5999 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6000 destmem = shallow_copy_rtx (destmem);
6001 set_mem_size (destmem, rounded_count);
6002 }
6003 else if (MEM_SIZE_KNOWN_P (destmem))
6004 clear_mem_size (destmem);
6005
6006 if (issetmem)
6007 {
6008 value = force_reg (mode, gen_lowpart (mode, value));
6009 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
6010 }
6011 else
6012 {
6013 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
6014 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6015 if (mode != QImode)
6016 {
6017 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6018 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6019 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6020 }
6021 else
6022 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6023 if (CONST_INT_P (count))
6024 {
6025 rounded_count
6026 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6027 srcmem = shallow_copy_rtx (srcmem);
6028 set_mem_size (srcmem, rounded_count);
6029 }
6030 else
6031 {
6032 if (MEM_SIZE_KNOWN_P (srcmem))
6033 clear_mem_size (srcmem);
6034 }
6035 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6036 destexp, srcexp));
6037 }
6038 }
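
/* For illustration only: a zeroing memset expanded with MODE == SImode
   corresponds roughly to

     movl  $0, %eax            # promoted VALUE
     movl  <count / 4>, %ecx   # scale_counter result
     rep stosl

   while a copy with MODE == QImode is essentially a plain "rep movsb".
   The DESTEXP/SRCEXP expressions above only describe the final pointer
   values for the benefit of the rep_stos/rep_mov patterns.  */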
6039
6040 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6041 DESTMEM.
6042 SRCMEM is passed by pointer so it can be updated on return.
6043 The return value is the updated DESTMEM. */
6044 static rtx
6045 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6046 HOST_WIDE_INT size_to_move)
6047 {
6048 rtx dst = destmem, src = *srcmem, adjust, tempreg;
6049 enum insn_code code;
6050 machine_mode move_mode;
6051 int piece_size, i;
6052
6053 /* Find the widest mode in which we could perform moves.
6054 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6055 it until a move of that size is supported. */
6056 piece_size = 1 << floor_log2 (size_to_move);
6057 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6058 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6059 {
6060 gcc_assert (piece_size > 1);
6061 piece_size >>= 1;
6062 }
6063
6064 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6065 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6066 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6067 {
6068 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6069 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6070 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6071 {
6072 move_mode = word_mode;
6073 piece_size = GET_MODE_SIZE (move_mode);
6074 code = optab_handler (mov_optab, move_mode);
6075 }
6076 }
6077 gcc_assert (code != CODE_FOR_nothing);
6078
6079 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6080 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6081
6082 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6083 gcc_assert (size_to_move % piece_size == 0);
6084 adjust = GEN_INT (piece_size);
6085 for (i = 0; i < size_to_move; i += piece_size)
6086 {
6087 /* We move from memory to memory, so we'll need to do it via
6088 a temporary register. */
6089 tempreg = gen_reg_rtx (move_mode);
6090 emit_insn (GEN_FCN (code) (tempreg, src));
6091 emit_insn (GEN_FCN (code) (dst, tempreg));
6092
6093 emit_move_insn (destptr,
6094 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6095 emit_move_insn (srcptr,
6096 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
6097
6098 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6099 piece_size);
6100 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6101 piece_size);
6102 }
6103
6104 /* Update DST and SRC rtx. */
6105 *srcmem = src;
6106 return dst;
6107 }
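
/* Worked example (illustrative): a call with SIZE_TO_MOVE == 8 on a 64-bit
   target picks DImode, so a single 8-byte load into a fresh temporary and a
   store from it are emitted, followed by the "destptr += 8" and
   "srcptr += 8" updates.  */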
6108
6109 /* Helper function for the string operations below. Test whether the VALUE
6110 bit of VARIABLE is clear; if so, emit a jump to the returned label. */
6111
6112 static rtx_code_label *
6113 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6114 {
6115 rtx_code_label *label = gen_label_rtx ();
6116 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6117 if (GET_MODE (variable) == DImode)
6118 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6119 else
6120 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6121 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6122 1, label);
6123 if (epilogue)
6124 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6125 else
6126 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6127 return label;
6128 }
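
/* For example, ix86_expand_aligntest (count, 4, true) emits code that
   behaves like

     testl $4, %ecx           # assuming COUNT lives in %ecx
     je    .Lskip

   so the caller can place a 4-byte move before emitting the returned label;
   that move then executes only when bit 2 of COUNT is set.  */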
6129
6130
6131 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6132
6133 static void
6134 expand_movmem_epilogue (rtx destmem, rtx srcmem,
6135 rtx destptr, rtx srcptr, rtx count, int max_size)
6136 {
6137 rtx src, dest;
6138 if (CONST_INT_P (count))
6139 {
6140 HOST_WIDE_INT countval = INTVAL (count);
6141 HOST_WIDE_INT epilogue_size = countval % max_size;
6142 int i;
6143
6144 /* For now MAX_SIZE should be a power of 2. This assert could be
6145 relaxed, but it would require a somewhat more complicated epilogue
6146 expansion. */
6147 gcc_assert ((max_size & (max_size - 1)) == 0);
6148 for (i = max_size; i >= 1; i >>= 1)
6149 {
6150 if (epilogue_size & i)
6151 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6152 }
6153 return;
6154 }
6155 if (max_size > 8)
6156 {
6157 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6158 count, 1, OPTAB_DIRECT);
6159 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6160 count, QImode, 1, 4, false);
6161 return;
6162 }
6163
6164 /* When single string operations are available, we can cheaply advance the
6165 dest and src pointers. Otherwise we save code size by maintaining an
6166 offset register (zero is readily available from the preceding rep
6167 operation) and using x86 addressing modes. */
6168 if (TARGET_SINGLE_STRINGOP)
6169 {
6170 if (max_size > 4)
6171 {
6172 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6173 src = change_address (srcmem, SImode, srcptr);
6174 dest = change_address (destmem, SImode, destptr);
6175 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6176 emit_label (label);
6177 LABEL_NUSES (label) = 1;
6178 }
6179 if (max_size > 2)
6180 {
6181 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6182 src = change_address (srcmem, HImode, srcptr);
6183 dest = change_address (destmem, HImode, destptr);
6184 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6185 emit_label (label);
6186 LABEL_NUSES (label) = 1;
6187 }
6188 if (max_size > 1)
6189 {
6190 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6191 src = change_address (srcmem, QImode, srcptr);
6192 dest = change_address (destmem, QImode, destptr);
6193 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6194 emit_label (label);
6195 LABEL_NUSES (label) = 1;
6196 }
6197 }
6198 else
6199 {
6200 rtx offset = force_reg (Pmode, const0_rtx);
6201 rtx tmp;
6202
6203 if (max_size > 4)
6204 {
6205 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6206 src = change_address (srcmem, SImode, srcptr);
6207 dest = change_address (destmem, SImode, destptr);
6208 emit_move_insn (dest, src);
6209 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6210 true, OPTAB_LIB_WIDEN);
6211 if (tmp != offset)
6212 emit_move_insn (offset, tmp);
6213 emit_label (label);
6214 LABEL_NUSES (label) = 1;
6215 }
6216 if (max_size > 2)
6217 {
6218 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6219 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6220 src = change_address (srcmem, HImode, tmp);
6221 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6222 dest = change_address (destmem, HImode, tmp);
6223 emit_move_insn (dest, src);
6224 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6225 true, OPTAB_LIB_WIDEN);
6226 if (tmp != offset)
6227 emit_move_insn (offset, tmp);
6228 emit_label (label);
6229 LABEL_NUSES (label) = 1;
6230 }
6231 if (max_size > 1)
6232 {
6233 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6234 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6235 src = change_address (srcmem, QImode, tmp);
6236 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6237 dest = change_address (destmem, QImode, tmp);
6238 emit_move_insn (dest, src);
6239 emit_label (label);
6240 LABEL_NUSES (label) = 1;
6241 }
6242 }
6243 }
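
/* Illustration of the constant-count path above: with MAX_SIZE == 16 and a
   count whose remainder modulo 16 is 13 (binary 1101), the loop emits an
   8-byte, a 4-byte and a 1-byte emit_memmov, in that order, with no
   branches.  */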
6244
6245 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6246 with value PROMOTED_VAL.
6247 Return value is the updated DESTMEM. */
6249 static rtx
6250 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6251 HOST_WIDE_INT size_to_move)
6252 {
6253 rtx dst = destmem, adjust;
6254 enum insn_code code;
6255 machine_mode move_mode;
6256 int piece_size, i;
6257
6258 /* Find the widest mode in which we could perform the moves.
6259 Start from the mode of PROMOTED_VAL and reduce it if SIZE_TO_MOVE
6260 is smaller than the size of that mode. */
6261 move_mode = GET_MODE (promoted_val);
6262 if (move_mode == VOIDmode)
6263 move_mode = QImode;
6264 if (size_to_move < GET_MODE_SIZE (move_mode))
6265 {
6266 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6267 move_mode = int_mode_for_size (move_bits, 0).require ();
6268 promoted_val = gen_lowpart (move_mode, promoted_val);
6269 }
6270 piece_size = GET_MODE_SIZE (move_mode);
6271 code = optab_handler (mov_optab, move_mode);
6272 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6273
6274 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6275
6276 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6277 gcc_assert (size_to_move % piece_size == 0);
6278 adjust = GEN_INT (piece_size);
6279 for (i = 0; i < size_to_move; i += piece_size)
6280 {
6281 if (piece_size <= GET_MODE_SIZE (word_mode))
6282 {
6283 emit_insn (gen_strset (destptr, dst, promoted_val));
6284 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6285 piece_size);
6286 continue;
6287 }
6288
6289 emit_insn (GEN_FCN (code) (dst, promoted_val));
6290
6291 emit_move_insn (destptr,
6292 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6293
6294 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6295 piece_size);
6296 }
6297
6298 /* Update DST rtx. */
6299 return dst;
6300 }
6301 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6302 static void
6303 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6304 rtx count, int max_size)
6305 {
6306 count = expand_simple_binop (counter_mode (count), AND, count,
6307 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6308 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
6309 gen_lowpart (QImode, value), count, QImode,
6310 1, max_size / 2, true);
6311 }
6312
6313 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6314 static void
6315 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6316 rtx count, int max_size)
6317 {
6318 rtx dest;
6319
6320 if (CONST_INT_P (count))
6321 {
6322 HOST_WIDE_INT countval = INTVAL (count);
6323 HOST_WIDE_INT epilogue_size = countval % max_size;
6324 int i;
6325
6326 /* For now MAX_SIZE should be a power of 2. This assert could be
6327 relaxed, but it would require a somewhat more complicated epilogue
6328 expansion. */
6329 gcc_assert ((max_size & (max_size - 1)) == 0);
6330 for (i = max_size; i >= 1; i >>= 1)
6331 {
6332 if (epilogue_size & i)
6333 {
6334 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6335 destmem = emit_memset (destmem, destptr, vec_value, i);
6336 else
6337 destmem = emit_memset (destmem, destptr, value, i);
6338 }
6339 }
6340 return;
6341 }
6342 if (max_size > 32)
6343 {
6344 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6345 return;
6346 }
6347 if (max_size > 16)
6348 {
6349 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6350 if (TARGET_64BIT)
6351 {
6352 dest = change_address (destmem, DImode, destptr);
6353 emit_insn (gen_strset (destptr, dest, value));
6354 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6355 emit_insn (gen_strset (destptr, dest, value));
6356 }
6357 else
6358 {
6359 dest = change_address (destmem, SImode, destptr);
6360 emit_insn (gen_strset (destptr, dest, value));
6361 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6362 emit_insn (gen_strset (destptr, dest, value));
6363 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6364 emit_insn (gen_strset (destptr, dest, value));
6365 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6366 emit_insn (gen_strset (destptr, dest, value));
6367 }
6368 emit_label (label);
6369 LABEL_NUSES (label) = 1;
6370 }
6371 if (max_size > 8)
6372 {
6373 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6374 if (TARGET_64BIT)
6375 {
6376 dest = change_address (destmem, DImode, destptr);
6377 emit_insn (gen_strset (destptr, dest, value));
6378 }
6379 else
6380 {
6381 dest = change_address (destmem, SImode, destptr);
6382 emit_insn (gen_strset (destptr, dest, value));
6383 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6384 emit_insn (gen_strset (destptr, dest, value));
6385 }
6386 emit_label (label);
6387 LABEL_NUSES (label) = 1;
6388 }
6389 if (max_size > 4)
6390 {
6391 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6392 dest = change_address (destmem, SImode, destptr);
6393 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6394 emit_label (label);
6395 LABEL_NUSES (label) = 1;
6396 }
6397 if (max_size > 2)
6398 {
6399 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6400 dest = change_address (destmem, HImode, destptr);
6401 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6402 emit_label (label);
6403 LABEL_NUSES (label) = 1;
6404 }
6405 if (max_size > 1)
6406 {
6407 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6408 dest = change_address (destmem, QImode, destptr);
6409 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6410 emit_label (label);
6411 LABEL_NUSES (label) = 1;
6412 }
6413 }
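
/* As an illustration of the variable-count path above, MAX_SIZE == 16 on a
   64-bit target produces a chain of alignment tests, conceptually

     if (count & 8) store 8 bytes of VALUE;
     if (count & 4) store 4 bytes;
     if (count & 2) store 2 bytes;
     if (count & 1) store 1 byte;

   where each guarded store advances DESTPTR via gen_strset.  */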
6414
6415 /* Adjust COUNTREG by subtracting VALUE from it. */
6416 static void
6417 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6418 {
6419 rtx (*gen_add)(rtx, rtx, rtx)
6420 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
6421
6422 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
6423 }
6424
6425 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6426 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6427 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6428 ignored.
6429 Return value is updated DESTMEM. */
6430
6431 static rtx
6432 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
6433 rtx destptr, rtx srcptr, rtx value,
6434 rtx vec_value, rtx count, int align,
6435 int desired_alignment, bool issetmem)
6436 {
6437 int i;
6438 for (i = 1; i < desired_alignment; i <<= 1)
6439 {
6440 if (align <= i)
6441 {
6442 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6443 if (issetmem)
6444 {
6445 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6446 destmem = emit_memset (destmem, destptr, vec_value, i);
6447 else
6448 destmem = emit_memset (destmem, destptr, value, i);
6449 }
6450 else
6451 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6452 ix86_adjust_counter (count, i);
6453 emit_label (label);
6454 LABEL_NUSES (label) = 1;
6455 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6456 }
6457 }
6458 return destmem;
6459 }
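
/* For example, with ALIGN == 1 and DESIRED_ALIGN == 8 the prologue above
   emits three guarded copies, conceptually

     if (destptr & 1) { copy/set 1 byte;  count -= 1; }
     if (destptr & 2) { copy/set 2 bytes; count -= 2; }
     if (destptr & 4) { copy/set 4 bytes; count -= 4; }

   after which DESTMEM is known to be 8-byte aligned.  */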
6460
6461 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
6462 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6463 and jump to DONE_LABEL. */
6464 static void
6465 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
6466 rtx destptr, rtx srcptr,
6467 rtx value, rtx vec_value,
6468 rtx count, int size,
6469 rtx done_label, bool issetmem)
6470 {
6471 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6472 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6473 rtx modesize;
6474 int n;
6475
6476 /* If we do not have a vector value to copy, we must reduce the size. */
6477 if (issetmem)
6478 {
6479 if (!vec_value)
6480 {
6481 if (GET_MODE (value) == VOIDmode && size > 8)
6482 mode = Pmode;
6483 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6484 mode = GET_MODE (value);
6485 }
6486 else
6487 mode = GET_MODE (vec_value), value = vec_value;
6488 }
6489 else
6490 {
6491 /* Choose appropriate vector mode. */
6492 if (size >= 32)
6493 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6494 else if (size >= 16)
6495 mode = TARGET_SSE ? V16QImode : DImode;
6496 srcmem = change_address (srcmem, mode, srcptr);
6497 }
6498 destmem = change_address (destmem, mode, destptr);
6499 modesize = GEN_INT (GET_MODE_SIZE (mode));
6500 gcc_assert (GET_MODE_SIZE (mode) <= size);
6501 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6502 {
6503 if (issetmem)
6504 emit_move_insn (destmem, gen_lowpart (mode, value));
6505 else
6506 {
6507 emit_move_insn (destmem, srcmem);
6508 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6509 }
6510 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6511 }
6512
6513 destmem = offset_address (destmem, count, 1);
6514 destmem = offset_address (destmem, GEN_INT (-2 * size),
6515 GET_MODE_SIZE (mode));
6516 if (!issetmem)
6517 {
6518 srcmem = offset_address (srcmem, count, 1);
6519 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6520 GET_MODE_SIZE (mode));
6521 }
6522 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6523 {
6524 if (issetmem)
6525 emit_move_insn (destmem, gen_lowpart (mode, value));
6526 else
6527 {
6528 emit_move_insn (destmem, srcmem);
6529 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6530 }
6531 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6532 }
6533 emit_jump_insn (gen_jump (done_label));
6534 emit_barrier ();
6535
6536 emit_label (label);
6537 LABEL_NUSES (label) = 1;
6538 }
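
/* The trick used above, for the copy case: when SIZE == 8 and COUNT is
   known to be in [8, 15] (the COUNT & SIZE test guards this path), the
   first 8 bytes and the last 8 bytes of the block are copied with two
   possibly overlapping 8-byte moves, covering every length in that range
   without a loop.  */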
6539
6540 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
6541 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6542 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
6543 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6544 DONE_LABEL is a label after the whole copying sequence. The label is created
6545 on demand if *DONE_LABEL is NULL.
6546 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for
6547 new bounds after the initial copies.
6548 
6549 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6550 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6551 we will dispatch to a library call for large blocks.
6552
6553 In pseudocode we do:
6554
6555 if (COUNT < SIZE)
6556 {
6557 Assume that SIZE is 4. Bigger sizes are handled analogously
6558 if (COUNT & 4)
6559 {
6560 copy 4 bytes from SRCPTR to DESTPTR
6561 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6562 goto done_label
6563 }
6564 if (!COUNT)
6565 goto done_label;
6566 copy 1 byte from SRCPTR to DESTPTR
6567 if (COUNT & 2)
6568 {
6569 copy 2 bytes from SRCPTR to DESTPTR
6570 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6571 }
6572 }
6573 else
6574 {
6575 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6576 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
6577
6578 OLD_DESTPTR = DESTPTR;
6579 Align DESTPTR up to DESIRED_ALIGN
6580 SRCPTR += DESTPTR - OLD_DESTPTR
6581 COUNT -= DESTPTR - OLD_DESTPTR
6582 if (DYNAMIC_CHECK)
6583 Round COUNT down to multiple of SIZE
6584 << optional caller supplied zero size guard is here >>
6585 << optional caller supplied dynamic check is here >>
6586 << caller supplied main copy loop is here >>
6587 }
6588 done_label:
6589 */
6590 static void
6591 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6592 rtx *destptr, rtx *srcptr,
6593 machine_mode mode,
6594 rtx value, rtx vec_value,
6595 rtx *count,
6596 rtx_code_label **done_label,
6597 int size,
6598 int desired_align,
6599 int align,
6600 unsigned HOST_WIDE_INT *min_size,
6601 bool dynamic_check,
6602 bool issetmem)
6603 {
6604 rtx_code_label *loop_label = NULL, *label;
6605 int n;
6606 rtx modesize;
6607 int prolog_size = 0;
6608 rtx mode_value;
6609
6610 /* Choose the proper value to copy. */
6611 if (issetmem && VECTOR_MODE_P (mode))
6612 mode_value = vec_value;
6613 else
6614 mode_value = value;
6615 gcc_assert (GET_MODE_SIZE (mode) <= size);
6616
6617 /* See if block is big or small, handle small blocks. */
6618 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6619 {
6620 int size2 = size;
6621 loop_label = gen_label_rtx ();
6622
6623 if (!*done_label)
6624 *done_label = gen_label_rtx ();
6625
6626 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6627 1, loop_label);
6628 size2 >>= 1;
6629
6630 /* Handle sizes > 3. */
6631 for (;size2 > 2; size2 >>= 1)
6632 expand_small_movmem_or_setmem (destmem, srcmem,
6633 *destptr, *srcptr,
6634 value, vec_value,
6635 *count,
6636 size2, *done_label, issetmem);
6637 /* Nothing to copy? Jump to DONE_LABEL if so. */
6638 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6639 1, *done_label);
6640
6641 /* Do a byte copy. */
6642 destmem = change_address (destmem, QImode, *destptr);
6643 if (issetmem)
6644 emit_move_insn (destmem, gen_lowpart (QImode, value));
6645 else
6646 {
6647 srcmem = change_address (srcmem, QImode, *srcptr);
6648 emit_move_insn (destmem, srcmem);
6649 }
6650
6651 /* Handle sizes 2 and 3. */
6652 label = ix86_expand_aligntest (*count, 2, false);
6653 destmem = change_address (destmem, HImode, *destptr);
6654 destmem = offset_address (destmem, *count, 1);
6655 destmem = offset_address (destmem, GEN_INT (-2), 2);
6656 if (issetmem)
6657 emit_move_insn (destmem, gen_lowpart (HImode, value));
6658 else
6659 {
6660 srcmem = change_address (srcmem, HImode, *srcptr);
6661 srcmem = offset_address (srcmem, *count, 1);
6662 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6663 emit_move_insn (destmem, srcmem);
6664 }
6665
6666 emit_label (label);
6667 LABEL_NUSES (label) = 1;
6668 emit_jump_insn (gen_jump (*done_label));
6669 emit_barrier ();
6670 }
6671 else
6672 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6673 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6674
6675 /* Start memcpy for COUNT >= SIZE. */
6676 if (loop_label)
6677 {
6678 emit_label (loop_label);
6679 LABEL_NUSES (loop_label) = 1;
6680 }
6681
6682 /* Copy the first DESIRED_ALIGN - ALIGN bytes (rounded up to whole MODE-sized moves). */
6683 if (!issetmem)
6684 srcmem = change_address (srcmem, mode, *srcptr);
6685 destmem = change_address (destmem, mode, *destptr);
6686 modesize = GEN_INT (GET_MODE_SIZE (mode));
6687 for (n = 0; prolog_size < desired_align - align; n++)
6688 {
6689 if (issetmem)
6690 emit_move_insn (destmem, mode_value);
6691 else
6692 {
6693 emit_move_insn (destmem, srcmem);
6694 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6695 }
6696 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6697 prolog_size += GET_MODE_SIZE (mode);
6698 }
6699
6700
6701 /* Copy last SIZE bytes. */
6702 destmem = offset_address (destmem, *count, 1);
6703 destmem = offset_address (destmem,
6704 GEN_INT (-size - prolog_size),
6705 1);
6706 if (issetmem)
6707 emit_move_insn (destmem, mode_value);
6708 else
6709 {
6710 srcmem = offset_address (srcmem, *count, 1);
6711 srcmem = offset_address (srcmem,
6712 GEN_INT (-size - prolog_size),
6713 1);
6714 emit_move_insn (destmem, srcmem);
6715 }
6716 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6717 {
6718 destmem = offset_address (destmem, modesize, 1);
6719 if (issetmem)
6720 emit_move_insn (destmem, mode_value);
6721 else
6722 {
6723 srcmem = offset_address (srcmem, modesize, 1);
6724 emit_move_insn (destmem, srcmem);
6725 }
6726 }
6727
6728 /* Align destination. */
6729 if (desired_align > 1 && desired_align > align)
6730 {
6731 rtx saveddest = *destptr;
6732
6733 gcc_assert (desired_align <= size);
6734 /* Align destptr up, place it to new register. */
6735 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6736 GEN_INT (prolog_size),
6737 NULL_RTX, 1, OPTAB_DIRECT);
6738 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6739 REG_POINTER (*destptr) = 1;
6740 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6741 GEN_INT (-desired_align),
6742 *destptr, 1, OPTAB_DIRECT);
6743 /* See how many bytes we skipped. */
6744 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6745 *destptr,
6746 saveddest, 1, OPTAB_DIRECT);
6747 /* Adjust srcptr and count. */
6748 if (!issetmem)
6749 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6750 saveddest, *srcptr, 1, OPTAB_DIRECT);
6751 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6752 saveddest, *count, 1, OPTAB_DIRECT);
6753 /* We copied at most size + prolog_size. */
6754 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6755 *min_size
6756 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6757 else
6758 *min_size = 0;
6759
6760 /* Our loops always round down the block size, but for dispatching to the
6761 library call we need the precise value. */
6762 if (dynamic_check)
6763 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6764 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6765 }
6766 else
6767 {
6768 gcc_assert (prolog_size == 0);
6769 /* Decrease count, so we won't end up copying last word twice. */
6770 if (!CONST_INT_P (*count))
6771 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6772 constm1_rtx, *count, 1, OPTAB_DIRECT);
6773 else
6774 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6775 (unsigned HOST_WIDE_INT)size));
6776 if (*min_size)
6777 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6778 }
6779 }
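
/* A worked instance of the alignment arithmetic above (the numbers are
   illustrative only): with *DESTPTR == 0x1003, PROLOG_SIZE == 16 and
   DESIRED_ALIGN == 16,

     destptr = (0x1003 + 16) & -16 = 0x1010
     srcptr += 0x1010 - 0x1003 = 13        (for the copy case)
     count  -= 13

   so the main loop starts on a 16-byte boundary, while the 13 skipped bytes
   were already covered by the misaligned prologue moves.  */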
6780
6781
6782 /* This function is like the previous one, except here we know how many bytes
6783 need to be copied. That allows us to update alignment not only of DST, which
6784 is returned, but also of SRC, which is passed as a pointer for that
6785 reason. */
6786 static rtx
6787 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6788 rtx srcreg, rtx value, rtx vec_value,
6789 int desired_align, int align_bytes,
6790 bool issetmem)
6791 {
6792 rtx src = NULL;
6793 rtx orig_dst = dst;
6794 rtx orig_src = NULL;
6795 int piece_size = 1;
6796 int copied_bytes = 0;
6797
6798 if (!issetmem)
6799 {
6800 gcc_assert (srcp != NULL);
6801 src = *srcp;
6802 orig_src = src;
6803 }
6804
6805 for (piece_size = 1;
6806 piece_size <= desired_align && copied_bytes < align_bytes;
6807 piece_size <<= 1)
6808 {
6809 if (align_bytes & piece_size)
6810 {
6811 if (issetmem)
6812 {
6813 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6814 dst = emit_memset (dst, destreg, vec_value, piece_size);
6815 else
6816 dst = emit_memset (dst, destreg, value, piece_size);
6817 }
6818 else
6819 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6820 copied_bytes += piece_size;
6821 }
6822 }
6823 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6824 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6825 if (MEM_SIZE_KNOWN_P (orig_dst))
6826 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6827
6828 if (!issetmem)
6829 {
6830 int src_align_bytes = get_mem_align_offset (src, desired_align
6831 * BITS_PER_UNIT);
6832 if (src_align_bytes >= 0)
6833 src_align_bytes = desired_align - src_align_bytes;
6834 if (src_align_bytes >= 0)
6835 {
6836 unsigned int src_align;
6837 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6838 {
6839 if ((src_align_bytes & (src_align - 1))
6840 == (align_bytes & (src_align - 1)))
6841 break;
6842 }
6843 if (src_align > (unsigned int) desired_align)
6844 src_align = desired_align;
6845 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6846 set_mem_align (src, src_align * BITS_PER_UNIT);
6847 }
6848 if (MEM_SIZE_KNOWN_P (orig_src))
6849 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6850 *srcp = src;
6851 }
6852
6853 return dst;
6854 }
6855
6856 /* Return true if ALG can be used in current context.
6857 Assume we expand memset if MEMSET is true. */
6858 static bool
6859 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6860 {
6861 if (alg == no_stringop)
6862 return false;
6863 if (alg == vector_loop)
6864 return TARGET_SSE || TARGET_AVX;
6865 /* Algorithms using the rep prefix want at least edi and ecx;
6866 additionally, memset wants eax and memcpy wants esi. Don't
6867 consider such algorithms if the user has appropriated those
6868 registers for their own purposes, or if we have a non-default
6869 address space, since some string insns cannot override the segment. */
6870 if (alg == rep_prefix_1_byte
6871 || alg == rep_prefix_4_byte
6872 || alg == rep_prefix_8_byte)
6873 {
6874 if (have_as)
6875 return false;
6876 if (fixed_regs[CX_REG]
6877 || fixed_regs[DI_REG]
6878 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6879 return false;
6880 }
6881 return true;
6882 }
6883
6884 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6885 static enum stringop_alg
6886 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6887 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6888 bool memset, bool zero_memset, bool have_as,
6889 int *dynamic_check, bool *noalign, bool recur)
6890 {
6891 const struct stringop_algs *algs;
6892 bool optimize_for_speed;
6893 int max = 0;
6894 const struct processor_costs *cost;
6895 int i;
6896 bool any_alg_usable_p = false;
6897
6898 *noalign = false;
6899 *dynamic_check = -1;
6900
6901 /* Even if the string operation call is cold, we still might spend a lot
6902 of time processing large blocks. */
6903 if (optimize_function_for_size_p (cfun)
6904 || (optimize_insn_for_size_p ()
6905 && (max_size < 256
6906 || (expected_size != -1 && expected_size < 256))))
6907 optimize_for_speed = false;
6908 else
6909 optimize_for_speed = true;
6910
6911 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6912 if (memset)
6913 algs = &cost->memset[TARGET_64BIT != 0];
6914 else
6915 algs = &cost->memcpy[TARGET_64BIT != 0];
6916
6917 /* Find the maximal size for which a non-libcall algorithm is specified. */
6918 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6919 {
6920 enum stringop_alg candidate = algs->size[i].alg;
6921 bool usable = alg_usable_p (candidate, memset, have_as);
6922 any_alg_usable_p |= usable;
6923
6924 if (candidate != libcall && candidate && usable)
6925 max = algs->size[i].max;
6926 }
6927
6928 /* If the expected size is not known but the maximal size is small enough
6929 that the inline version is a win, set the expected size to the middle
6930 of the range. */
6931 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6932 && expected_size == -1)
6933 expected_size = min_size / 2 + max_size / 2;
6934
6935 /* If user specified the algorithm, honor it if possible. */
6936 if (ix86_stringop_alg != no_stringop
6937 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6938 return ix86_stringop_alg;
6939 /* rep; movq or rep; movl is the smallest variant. */
6940 else if (!optimize_for_speed)
6941 {
6942 *noalign = true;
6943 if (!count || (count & 3) || (memset && !zero_memset))
6944 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6945 ? rep_prefix_1_byte : loop_1_byte;
6946 else
6947 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6948 ? rep_prefix_4_byte : loop;
6949 }
6950 /* Very tiny blocks are best handled via the loop; REP is expensive to
6951 set up. */
6952 else if (expected_size != -1 && expected_size < 4)
6953 return loop_1_byte;
6954 else if (expected_size != -1)
6955 {
6956 enum stringop_alg alg = libcall;
6957 bool alg_noalign = false;
6958 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6959 {
6960 /* We get here if the algorithms that were not libcall-based
6961 were rep-prefix based and we are unable to use rep prefixes
6962 based on global register usage. Break out of the loop and
6963 use the heuristic below. */
6964 if (algs->size[i].max == 0)
6965 break;
6966 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6967 {
6968 enum stringop_alg candidate = algs->size[i].alg;
6969
6970 if (candidate != libcall
6971 && alg_usable_p (candidate, memset, have_as))
6972 {
6973 alg = candidate;
6974 alg_noalign = algs->size[i].noalign;
6975 }
6976 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6977 last non-libcall inline algorithm. */
6978 if (TARGET_INLINE_ALL_STRINGOPS)
6979 {
6980 /* When the current size is best copied by a libcall, but we are still
6981 forced to inline, run the heuristic below that will pick code for
6982 medium-sized blocks. */
6983 if (alg != libcall)
6984 {
6985 *noalign = alg_noalign;
6986 return alg;
6987 }
6988 else if (!any_alg_usable_p)
6989 break;
6990 }
6991 else if (alg_usable_p (candidate, memset, have_as))
6992 {
6993 *noalign = algs->size[i].noalign;
6994 return candidate;
6995 }
6996 }
6997 }
6998 }
6999 /* When asked to inline the call anyway, try to pick a meaningful choice.
7000 We look for the maximal size of block that is faster to copy by hand and
7001 take blocks of at most that size, guessing that the average size will
7002 be roughly half of that maximum.
7003 
7004 If this turns out to be bad, we might simply specify the preferred
7005 choice in ix86_costs. */
7006 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7007 && (algs->unknown_size == libcall
7008 || !alg_usable_p (algs->unknown_size, memset, have_as)))
7009 {
7010 enum stringop_alg alg;
7011 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
7012
7013 /* If there aren't any usable algorithms or if recursing already,
7014 then recursing on smaller sizes or same size isn't going to
7015 find anything. Just return the simple byte-at-a-time copy loop. */
7016 if (!any_alg_usable_p || recur)
7017 {
7018 /* Pick something reasonable. */
7019 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7020 *dynamic_check = 128;
7021 return loop_1_byte;
7022 }
7023 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7024 zero_memset, have_as, dynamic_check, noalign, true);
7025 gcc_assert (*dynamic_check == -1);
7026 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7027 *dynamic_check = max;
7028 else
7029 gcc_assert (alg != libcall);
7030 return alg;
7031 }
7032 return (alg_usable_p (algs->unknown_size, memset, have_as)
7033 ? algs->unknown_size : libcall);
7034 }
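
/* As a hypothetical example of the size table consulted above: a cost table
   whose entries are {256, loop, false}, {8192, rep_prefix_8_byte, true} and
   {-1, libcall, false} makes decide_alg pick the simple loop for expected
   sizes up to 256 bytes, "rep movsq" (with *NOALIGN set) up to 8192 bytes,
   and fall back to a library call beyond that, subject to alg_usable_p and
   the -mstringop-strategy override.  */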
7035
7036 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7037 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7038 static int
7039 decide_alignment (int align,
7040 enum stringop_alg alg,
7041 int expected_size,
7042 machine_mode move_mode)
7043 {
7044 int desired_align = 0;
7045
7046 gcc_assert (alg != no_stringop);
7047
7048 if (alg == libcall)
7049 return 0;
7050 if (move_mode == VOIDmode)
7051 return 0;
7052
7053 desired_align = GET_MODE_SIZE (move_mode);
7054 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
7055 copying a whole cache line at once. */
7056 if (TARGET_PENTIUMPRO
7057 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7058 desired_align = 8;
7059
7060 if (optimize_size)
7061 desired_align = 1;
7062 if (desired_align < align)
7063 desired_align = align;
7064 if (expected_size != -1 && expected_size < 4)
7065 desired_align = align;
7066
7067 return desired_align;
7068 }
7069
7070
7071 /* Helper function for memset. For a QImode value 0xXY produce
7072 0xXYXYXYXY of the width specified by MODE. This is essentially
7073 a multiplication by 0x01010101, but we can do slightly better than
7074 synth_mult by unwinding the sequence by hand on CPUs with
7075 slow multiply. */
7076 static rtx
7077 promote_duplicated_reg (machine_mode mode, rtx val)
7078 {
7079 machine_mode valmode = GET_MODE (val);
7080 rtx tmp;
7081 int nops = mode == DImode ? 3 : 2;
7082
7083 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7084 if (val == const0_rtx)
7085 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7086 if (CONST_INT_P (val))
7087 {
7088 HOST_WIDE_INT v = INTVAL (val) & 255;
7089
7090 v |= v << 8;
7091 v |= v << 16;
7092 if (mode == DImode)
7093 v |= (v << 16) << 16;
7094 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7095 }
7096
7097 if (valmode == VOIDmode)
7098 valmode = QImode;
7099 if (valmode != QImode)
7100 val = gen_lowpart (QImode, val);
7101 if (mode == QImode)
7102 return val;
7103 if (!TARGET_PARTIAL_REG_STALL)
7104 nops--;
7105 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7106 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7107 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7108 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7109 {
7110 rtx reg = convert_modes (mode, QImode, val, true);
7111 tmp = promote_duplicated_reg (mode, const1_rtx);
7112 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7113 OPTAB_DIRECT);
7114 }
7115 else
7116 {
7117 rtx reg = convert_modes (mode, QImode, val, true);
7118
7119 if (!TARGET_PARTIAL_REG_STALL)
7120 if (mode == SImode)
7121 emit_insn (gen_insvsi_1 (reg, reg));
7122 else
7123 emit_insn (gen_insvdi_1 (reg, reg));
7124 else
7125 {
7126 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7127 NULL, 1, OPTAB_DIRECT);
7128 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7129 OPTAB_DIRECT);
7130 }
7131 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7132 NULL, 1, OPTAB_DIRECT);
7133 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7134 if (mode == SImode)
7135 return reg;
7136 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7137 NULL, 1, OPTAB_DIRECT);
7138 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7139 return reg;
7140 }
7141 }
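
/* The shift-and-or unwinding above corresponds to, e.g. for the byte 0xAB
   promoted to SImode:

     v  = 0x000000AB
     v |= v << 8;     ->  0x0000ABAB
     v |= v << 16;    ->  0xABABABAB

   with one more "v |= v << 32" step for DImode.  */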
7142
7143 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7144 will be needed by the main loop copying SIZE_NEEDED chunks and by the
7145 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
7146 static rtx
7147 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7148 int align)
7149 {
7150 rtx promoted_val;
7151
7152 if (TARGET_64BIT
7153 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7154 promoted_val = promote_duplicated_reg (DImode, val);
7155 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7156 promoted_val = promote_duplicated_reg (SImode, val);
7157 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7158 promoted_val = promote_duplicated_reg (HImode, val);
7159 else
7160 promoted_val = val;
7161
7162 return promoted_val;
7163 }
7164
7165 /* Copy the address to a Pmode register. This is used for x32 to
7166 truncate DImode TLS address to a SImode register. */
7167
7168 static rtx
7169 ix86_copy_addr_to_reg (rtx addr)
7170 {
7171 rtx reg;
7172 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7173 {
7174 reg = copy_addr_to_reg (addr);
7175 REG_POINTER (reg) = 1;
7176 return reg;
7177 }
7178 else
7179 {
7180 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7181 reg = copy_to_mode_reg (DImode, addr);
7182 REG_POINTER (reg) = 1;
7183 return gen_rtx_SUBREG (SImode, reg, 0);
7184 }
7185 }
7186
7187 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
7188 operations when profitable. The code depends upon architecture, block size
7189 and alignment, but always has one of the following overall structures:
7190
7191 Aligned move sequence:
7192
7193 1) Prologue guard: Conditional that jumps up to epilogues for small
7194 blocks that can be handled by the epilogue alone. This is faster
7195 but also needed for correctness, since the prologue assumes the block
7196 is larger than the desired alignment.
7197
7198 Optional dynamic check for size and libcall for large
7199 blocks is emitted here too, with -minline-stringops-dynamically.
7200
7201 2) Prologue: copy first few bytes in order to get destination
7202 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7203 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7204 copied. We emit either a jump tree on power of two sized
7205 blocks, or a byte loop.
7206
7207 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7208 with specified algorithm.
7209
7210 4) Epilogue: code copying tail of the block that is too small to be
7211 handled by main body (or up to size guarded by prologue guard).
7212
7213 Misaligned move sequence
7214
7215 1) Misaligned move prologue/epilogue containing:
7216 a) Prologue handling small memory blocks and jumping to done_label
7217 (skipped if blocks are known to be large enough)
7218 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
7219 is needed, done by a single possibly misaligned move
7220 (skipped if alignment is not needed)
7221 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7222
7223 2) Zero size guard dispatching to done_label, if needed
7224
7225 3) dispatch to library call, if needed,
7226
7227 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7228 with specified algorithm. */
7229 bool
7230 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7231 rtx align_exp, rtx expected_align_exp,
7232 rtx expected_size_exp, rtx min_size_exp,
7233 rtx max_size_exp, rtx probable_max_size_exp,
7234 bool issetmem)
7235 {
7236 rtx destreg;
7237 rtx srcreg = NULL;
7238 rtx_code_label *label = NULL;
7239 rtx tmp;
7240 rtx_code_label *jump_around_label = NULL;
7241 HOST_WIDE_INT align = 1;
7242 unsigned HOST_WIDE_INT count = 0;
7243 HOST_WIDE_INT expected_size = -1;
7244 int size_needed = 0, epilogue_size_needed;
7245 int desired_align = 0, align_bytes = 0;
7246 enum stringop_alg alg;
7247 rtx promoted_val = NULL;
7248 rtx vec_promoted_val = NULL;
7249 bool force_loopy_epilogue = false;
7250 int dynamic_check;
7251 bool need_zero_guard = false;
7252 bool noalign;
7253 machine_mode move_mode = VOIDmode;
7254 machine_mode wider_mode;
7255 int unroll_factor = 1;
7256 /* TODO: Once value ranges are available, fill in proper data. */
7257 unsigned HOST_WIDE_INT min_size = 0;
7258 unsigned HOST_WIDE_INT max_size = -1;
7259 unsigned HOST_WIDE_INT probable_max_size = -1;
7260 bool misaligned_prologue_used = false;
7261 bool have_as;
7262
7263 if (CONST_INT_P (align_exp))
7264 align = INTVAL (align_exp);
7265 /* i386 can do misaligned access at a reasonably increased cost. */
7266 if (CONST_INT_P (expected_align_exp)
7267 && INTVAL (expected_align_exp) > align)
7268 align = INTVAL (expected_align_exp);
7269 /* ALIGN is the minimum of destination and source alignment, but we care here
7270 just about destination alignment. */
7271 else if (!issetmem
7272 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7273 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7274
7275 if (CONST_INT_P (count_exp))
7276 {
7277 min_size = max_size = probable_max_size = count = expected_size
7278 = INTVAL (count_exp);
7279 /* When COUNT is 0, there is nothing to do. */
7280 if (!count)
7281 return true;
7282 }
7283 else
7284 {
7285 if (min_size_exp)
7286 min_size = INTVAL (min_size_exp);
7287 if (max_size_exp)
7288 max_size = INTVAL (max_size_exp);
7289 if (probable_max_size_exp)
7290 probable_max_size = INTVAL (probable_max_size_exp);
7291 if (CONST_INT_P (expected_size_exp))
7292 expected_size = INTVAL (expected_size_exp);
7293 }
7294
7295 /* Make sure we don't need to care about overflow later on. */
7296 if (count > (HOST_WIDE_INT_1U << 30))
7297 return false;
7298
7299 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7300 if (!issetmem)
7301 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7302
7303 /* Step 0: Decide on preferred algorithm, desired alignment and
7304 size of chunks to be copied by main loop. */
7305 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7306 issetmem,
7307 issetmem && val_exp == const0_rtx, have_as,
7308 &dynamic_check, &noalign, false);
7309
7310 if (dump_file)
7311 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7312 stringop_alg_names[alg]);
7313
7314 if (alg == libcall)
7315 return false;
7316 gcc_assert (alg != no_stringop);
7317
7318 /* For now the vector version of memset is generated only for memory zeroing,
7319 as creating the promoted vector value is very cheap in this case. */
7320 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7321 alg = unrolled_loop;
7322
7323 if (!count)
7324 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7325 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7326 if (!issetmem)
7327 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7328
7329 unroll_factor = 1;
7330 move_mode = word_mode;
7331 switch (alg)
7332 {
7333 case libcall:
7334 case no_stringop:
7335 case last_alg:
7336 gcc_unreachable ();
7337 case loop_1_byte:
7338 need_zero_guard = true;
7339 move_mode = QImode;
7340 break;
7341 case loop:
7342 need_zero_guard = true;
7343 break;
7344 case unrolled_loop:
7345 need_zero_guard = true;
7346 unroll_factor = (TARGET_64BIT ? 4 : 2);
7347 break;
7348 case vector_loop:
7349 need_zero_guard = true;
7350 unroll_factor = 4;
7351 /* Find the widest supported mode. */
7352 move_mode = word_mode;
7353 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7354 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7355 move_mode = wider_mode;
7356
7357 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
7358 move_mode = TImode;
7359
7360 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7361 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7362 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7363 {
7364 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7365 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7366 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7367 move_mode = word_mode;
7368 }
7369 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7370 break;
7371 case rep_prefix_8_byte:
7372 move_mode = DImode;
7373 break;
7374 case rep_prefix_4_byte:
7375 move_mode = SImode;
7376 break;
7377 case rep_prefix_1_byte:
7378 move_mode = QImode;
7379 break;
7380 }
7381 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7382 epilogue_size_needed = size_needed;
7383
7384 /* If we are going to make any library calls conditionally, make sure any
7385 pending stack adjustments happen before the first conditional branch;
7386 otherwise they will be emitted only before the library call and won't
7387 happen on the other branches. */
7388 if (dynamic_check != -1)
7389 do_pending_stack_adjust ();
7390
7391 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7392 if (!TARGET_ALIGN_STRINGOPS || noalign)
7393 align = desired_align;
7394
7395 /* Step 1: Prologue guard. */
7396
7397 /* Alignment code needs count to be in register. */
7398 if (CONST_INT_P (count_exp) && desired_align > align)
7399 {
7400 if (INTVAL (count_exp) > desired_align
7401 && INTVAL (count_exp) > size_needed)
7402 {
7403 align_bytes
7404 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7405 if (align_bytes <= 0)
7406 align_bytes = 0;
7407 else
7408 align_bytes = desired_align - align_bytes;
7409 }
7410 if (align_bytes == 0)
7411 count_exp = force_reg (counter_mode (count_exp), count_exp);
7412 }
7413 gcc_assert (desired_align >= 1 && align >= 1);
7414
7415 /* Misaligned move sequences handle both prologue and epilogue at once.
7416 Default code generation results in smaller code for large alignments
7417 and also avoids redundant work when sizes are known precisely. */
7418 misaligned_prologue_used
7419 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7420 && MAX (desired_align, epilogue_size_needed) <= 32
7421 && desired_align <= epilogue_size_needed
7422 && ((desired_align > align && !align_bytes)
7423 || (!count && epilogue_size_needed > 1)));
7424
7425 /* Do the cheap promotion to allow better CSE across the
7426 main loop and epilogue (i.e. one load of the big constant in
7427 front of all the code).
7428 For now the misaligned move sequences do not have a fast path
7429 without broadcasting. */
7430 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7431 {
7432 if (alg == vector_loop)
7433 {
7434 gcc_assert (val_exp == const0_rtx);
7435 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7436 promoted_val = promote_duplicated_reg_to_size (val_exp,
7437 GET_MODE_SIZE (word_mode),
7438 desired_align, align);
7439 }
7440 else
7441 {
7442 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7443 desired_align, align);
7444 }
7445 }
7446 /* Misaligned move sequences handle both prologues and epilogues at once.
7447 Default code generation results in smaller code for large alignments and
7448 also avoids redundant work when sizes are known precisely. */
7449 if (misaligned_prologue_used)
7450 {
7451 /* The misaligned move prologue handles small blocks by itself. */
7452 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
7453 (dst, src, &destreg, &srcreg,
7454 move_mode, promoted_val, vec_promoted_val,
7455 &count_exp,
7456 &jump_around_label,
7457 desired_align < align
7458 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7459 desired_align, align, &min_size, dynamic_check, issetmem);
7460 if (!issetmem)
7461 src = change_address (src, BLKmode, srcreg);
7462 dst = change_address (dst, BLKmode, destreg);
7463 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7464 epilogue_size_needed = 0;
7465 if (need_zero_guard
7466 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7467 {
7468 /* It is possible that we copied enough so the main loop will not
7469 execute. */
7470 gcc_assert (size_needed > 1);
7471 if (jump_around_label == NULL_RTX)
7472 jump_around_label = gen_label_rtx ();
7473 emit_cmp_and_jump_insns (count_exp,
7474 GEN_INT (size_needed),
7475 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7476 if (expected_size == -1
7477 || expected_size < (desired_align - align) / 2 + size_needed)
7478 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7479 else
7480 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7481 }
7482 }
7483 /* Ensure that alignment prologue won't copy past end of block. */
7484 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7485 {
7486 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7487 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7488 Make sure it is power of 2. */
7489 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7490
7491 /* To improve performance on small blocks, we jump around the VAL
7492 promoting code. This means that if the promoted VAL is not constant,
7493 we might not use it in the epilogue and have to use the byte
7494 loop variant. */
7495 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7496 force_loopy_epilogue = true;
7497 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7498 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7499 {
7500 /* If main algorithm works on QImode, no epilogue is needed.
7501 For small sizes just don't align anything. */
7502 if (size_needed == 1)
7503 desired_align = align;
7504 else
7505 goto epilogue;
7506 }
7507 else if (!count
7508 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7509 {
7510 label = gen_label_rtx ();
7511 emit_cmp_and_jump_insns (count_exp,
7512 GEN_INT (epilogue_size_needed),
7513 LTU, 0, counter_mode (count_exp), 1, label);
7514 if (expected_size == -1 || expected_size < epilogue_size_needed)
7515 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7516 else
7517 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7518 }
7519 }
7520
7521 /* Emit code to decide on runtime whether library call or inline should be
7522 used. */
7523 if (dynamic_check != -1)
7524 {
7525 if (!issetmem && CONST_INT_P (count_exp))
7526 {
7527 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7528 {
7529 emit_block_copy_via_libcall (dst, src, count_exp);
7530 count_exp = const0_rtx;
7531 goto epilogue;
7532 }
7533 }
7534 else
7535 {
7536 rtx_code_label *hot_label = gen_label_rtx ();
7537 if (jump_around_label == NULL_RTX)
7538 jump_around_label = gen_label_rtx ();
7539 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7540 LEU, 0, counter_mode (count_exp),
7541 1, hot_label);
7542 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7543 if (issetmem)
7544 set_storage_via_libcall (dst, count_exp, val_exp);
7545 else
7546 emit_block_copy_via_libcall (dst, src, count_exp);
7547 emit_jump (jump_around_label);
7548 emit_label (hot_label);
7549 }
7550 }
7551
7552 /* Step 2: Alignment prologue. */
7553 /* Do the expensive promotion once we branched off the small blocks. */
7554 if (issetmem && !promoted_val)
7555 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7556 desired_align, align);
7557
7558 if (desired_align > align && !misaligned_prologue_used)
7559 {
7560 if (align_bytes == 0)
7561 {
7562 /* Except for the first move in the prologue, we no longer know
7563 the constant offset in the aliasing info. It doesn't seem worth
7564 the pain to maintain it for the first move, so throw away
7565 the info early. */
7566 dst = change_address (dst, BLKmode, destreg);
7567 if (!issetmem)
7568 src = change_address (src, BLKmode, srcreg);
7569 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
7570 promoted_val, vec_promoted_val,
7571 count_exp, align, desired_align,
7572 issetmem);
7573 /* At most desired_align - align bytes are copied. */
7574 if (min_size < (unsigned)(desired_align - align))
7575 min_size = 0;
7576 else
7577 min_size -= desired_align - align;
7578 }
7579 else
7580 {
7581 /* If we know how many bytes need to be stored before dst is
7582 sufficiently aligned, maintain aliasing info accurately. */
7583 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
7584 srcreg,
7585 promoted_val,
7586 vec_promoted_val,
7587 desired_align,
7588 align_bytes,
7589 issetmem);
7590
7591 count_exp = plus_constant (counter_mode (count_exp),
7592 count_exp, -align_bytes);
7593 count -= align_bytes;
7594 min_size -= align_bytes;
7595 max_size -= align_bytes;
7596 }
7597 if (need_zero_guard
7598 && min_size < (unsigned HOST_WIDE_INT) size_needed
7599 && (count < (unsigned HOST_WIDE_INT) size_needed
7600 || (align_bytes == 0
7601 && count < ((unsigned HOST_WIDE_INT) size_needed
7602 + desired_align - align))))
7603 {
7604 /* It is possible that we copied enough so the main loop will not
7605 execute. */
7606 gcc_assert (size_needed > 1);
7607 if (label == NULL_RTX)
7608 label = gen_label_rtx ();
7609 emit_cmp_and_jump_insns (count_exp,
7610 GEN_INT (size_needed),
7611 LTU, 0, counter_mode (count_exp), 1, label);
7612 if (expected_size == -1
7613 || expected_size < (desired_align - align) / 2 + size_needed)
7614 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7615 else
7616 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7617 }
7618 }
7619 if (label && size_needed == 1)
7620 {
7621 emit_label (label);
7622 LABEL_NUSES (label) = 1;
7623 label = NULL;
7624 epilogue_size_needed = 1;
7625 if (issetmem)
7626 promoted_val = val_exp;
7627 }
7628 else if (label == NULL_RTX && !misaligned_prologue_used)
7629 epilogue_size_needed = size_needed;
7630
7631 /* Step 3: Main loop. */
7632
7633 switch (alg)
7634 {
7635 case libcall:
7636 case no_stringop:
7637 case last_alg:
7638 gcc_unreachable ();
7639 case loop_1_byte:
7640 case loop:
7641 case unrolled_loop:
7642 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
7643 count_exp, move_mode, unroll_factor,
7644 expected_size, issetmem);
7645 break;
7646 case vector_loop:
7647 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
7648 vec_promoted_val, count_exp, move_mode,
7649 unroll_factor, expected_size, issetmem);
7650 break;
7651 case rep_prefix_8_byte:
7652 case rep_prefix_4_byte:
7653 case rep_prefix_1_byte:
7654 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
7655 val_exp, count_exp, move_mode, issetmem);
7656 break;
7657 }
7658 /* Properly adjust the offsets of the src and dest memory for aliasing. */
7659 if (CONST_INT_P (count_exp))
7660 {
7661 if (!issetmem)
7662 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7663 (count / size_needed) * size_needed);
7664 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7665 (count / size_needed) * size_needed);
7666 }
7667 else
7668 {
7669 if (!issetmem)
7670 src = change_address (src, BLKmode, srcreg);
7671 dst = change_address (dst, BLKmode, destreg);
7672 }
7673
7674 /* Step 4: Epilogue to copy the remaining bytes. */
7675 epilogue:
7676 if (label)
7677 {
7678 /* When the main loop is done, COUNT_EXP might hold the original count,
7679 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
7680 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
7681 bytes. Compensate if needed. */
7682
7683 if (size_needed < epilogue_size_needed)
7684 {
7685 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7686 GEN_INT (size_needed - 1), count_exp, 1,
7687 OPTAB_DIRECT);
7688 if (tmp != count_exp)
7689 emit_move_insn (count_exp, tmp);
7690 }
7691 emit_label (label);
7692 LABEL_NUSES (label) = 1;
7693 }
7694
7695 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7696 {
7697 if (force_loopy_epilogue)
7698 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7699 epilogue_size_needed);
7700 else
7701 {
7702 if (issetmem)
7703 expand_setmem_epilogue (dst, destreg, promoted_val,
7704 vec_promoted_val, count_exp,
7705 epilogue_size_needed);
7706 else
7707 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
7708 epilogue_size_needed);
7709 }
7710 }
7711 if (jump_around_label)
7712 emit_label (jump_around_label);
7713 return true;
7714 }
7715
7716
7717 /* Expand the appropriate insns for doing strlen if not just doing
7718 repnz; scasb
7719
7720 out = result, initialized with the start address
7721 align_rtx = alignment of the address.
7722 scratch = scratch register, initialized with the start address when
7723 not aligned, otherwise undefined
7724
7725 This is just the body. It needs the initializations mentioned above and
7726 some address computing at the end. These things are done in i386.md. */
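/* Overview of the expansion below: OUT is advanced a byte at a time until
   the address is 4-byte aligned (jumping to the end as soon as a zero byte
   is seen), then four bytes are scanned per iteration using a bit trick
   that detects a zero byte without per-byte branches, and finally OUT is
   adjusted back to point at the terminating zero.  */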
7727
7728 static void
7729 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7730 {
7731 int align;
7732 rtx tmp;
7733 rtx_code_label *align_2_label = NULL;
7734 rtx_code_label *align_3_label = NULL;
7735 rtx_code_label *align_4_label = gen_label_rtx ();
7736 rtx_code_label *end_0_label = gen_label_rtx ();
7737 rtx mem;
7738 rtx tmpreg = gen_reg_rtx (SImode);
7739 rtx scratch = gen_reg_rtx (SImode);
7740 rtx cmp;
7741
7742 align = 0;
7743 if (CONST_INT_P (align_rtx))
7744 align = INTVAL (align_rtx);
7745
7746 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7747
7748 /* Is there a known alignment and is it less than 4? */
7749 if (align < 4)
7750 {
7751 rtx scratch1 = gen_reg_rtx (Pmode);
7752 emit_move_insn (scratch1, out);
7753 /* Is there a known alignment and is it not 2? */
7754 if (align != 2)
7755 {
7756 align_3_label = gen_label_rtx (); /* Label when address is 3 mod 4. */
7757 align_2_label = gen_label_rtx (); /* Label when address is 2 mod 4. */
7758
7759 /* Leave just the 3 lower bits. */
7760 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7761 NULL_RTX, 0, OPTAB_WIDEN);
7762
7763 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7764 Pmode, 1, align_4_label);
7765 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7766 Pmode, 1, align_2_label);
7767 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7768 Pmode, 1, align_3_label);
7769 }
7770 else
7771 {
7772 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7773 check whether it is aligned to a 4-byte boundary. */
7774
7775 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7776 NULL_RTX, 0, OPTAB_WIDEN);
7777
7778 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7779 Pmode, 1, align_4_label);
7780 }
7781
7782 mem = change_address (src, QImode, out);
7783
7784 /* Now compare the bytes. */
7785
7786 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
7787 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7788 QImode, 1, end_0_label);
7789
7790 /* Increment the address. */
7791 emit_insn (gen_add2_insn (out, const1_rtx));
7792
7793 /* Not needed with an alignment of 2 */
7794 if (align != 2)
7795 {
7796 emit_label (align_2_label);
7797
7798 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7799 end_0_label);
7800
7801 emit_insn (gen_add2_insn (out, const1_rtx));
7802
7803 emit_label (align_3_label);
7804 }
7805
7806 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7807 end_0_label);
7808
7809 emit_insn (gen_add2_insn (out, const1_rtx));
7810 }
7811
7812 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
7813 align this loop: it only makes the program bigger and does not help
7814 to speed it up. */
7815 emit_label (align_4_label);
7816
7817 mem = change_address (src, SImode, out);
7818 emit_move_insn (scratch, mem);
7819 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7820
7821 /* This formula yields a nonzero result iff one of the bytes is zero.
7822 This saves three branches inside the loop and many cycles. */
7823
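/* The sequence below computes (x - 0x01010101) & ~x & 0x80808080, which is
   nonzero exactly when some byte of x is zero.  For example, with
   x = 0x12003456:
     x - 0x01010101          = 0x10ff3355
     ~x                      = 0xedffcba9
     AND of the two          = 0x00ff0301
     masked with 0x80808080  = 0x00800000
   so the high bit of the byte that was zero is set.  */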
7824 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7825 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7826 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7827 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7828 gen_int_mode (0x80808080, SImode)));
7829 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7830 align_4_label);
7831
7832 if (TARGET_CMOVE)
7833 {
7834 rtx reg = gen_reg_rtx (SImode);
7835 rtx reg2 = gen_reg_rtx (Pmode);
7836 emit_move_insn (reg, tmpreg);
7837 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7838
7839 /* If zero is not in the first two bytes, move two bytes forward. */
7840 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7841 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7842 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7843 emit_insn (gen_rtx_SET (tmpreg,
7844 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7845 reg,
7846 tmpreg)));
7847 /* Emit lea manually to avoid clobbering of flags. */
7848 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
7849
7850 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7851 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7852 emit_insn (gen_rtx_SET (out,
7853 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7854 reg2,
7855 out)));
7856 }
7857 else
7858 {
7859 rtx_code_label *end_2_label = gen_label_rtx ();
7860 /* Is zero in the first two bytes? */
7861
7862 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7863 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7864 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7865 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7866 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7867 pc_rtx);
7868 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7869 JUMP_LABEL (tmp) = end_2_label;
7870
7871 /* Not in the first two. Move two bytes forward. */
7872 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7873 emit_insn (gen_add2_insn (out, const2_rtx));
7874
7875 emit_label (end_2_label);
7876
7877 }
7878
7879 /* Avoid branch in fixing the byte. */
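/* Here OUT is 4 (or 6) bytes past the start of the word that contained the
   zero, and the low byte of TMPREG is 0x80 iff the zero was the lower of
   the two remaining candidate bytes.  Adding TMPREG to itself sets the
   carry exactly in that case, so the subtract-with-carry below backs OUT
   up by 4 or 3 to land on the terminating zero byte.  */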
7880 tmpreg = gen_lowpart (QImode, tmpreg);
7881 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7882 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7883 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7884 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7885
7886 emit_label (end_0_label);
7887 }
7888
7889 /* Expand strlen. */
7890
7891 bool
7892 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7893 {
7894 if (TARGET_UNROLL_STRLEN
7895 && TARGET_INLINE_ALL_STRINGOPS
7896 && eoschar == const0_rtx
7897 && optimize > 1)
7898 {
7899 /* The generic case of the strlen expander is long. Avoid
7900 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
7901 rtx addr = force_reg (Pmode, XEXP (src, 0));
7902 /* It seems that some optimizers do not combine a call like
7903 foo(strlen(bar), strlen(bar));
7904 when the move and the subtraction are done here: the length is
7905 computed only once if these instructions are emitted inside
7906 output_strlen_unroll(). But since &bar[strlen(bar)] is often
7907 used, and this way one fewer register is live for the lifetime of
7908 output_strlen_unroll(), doing it here is better. */
7909
7910 emit_move_insn (out, addr);
7911
7912 ix86_expand_strlensi_unroll_1 (out, src, align);
7913
7914 /* strlensi_unroll_1 returns the address of the zero at the end of
7915 the string, like memchr(), so compute the length by subtracting
7916 the start address. */
7917 emit_insn (gen_sub2_insn (out, addr));
7918 return true;
7919 }
7920 else
7921 return false;
7922 }
7923
7924 /* For a given symbol (function), construct code to compute the address of its
7925 PLT entry in the large x86-64 PIC model. */
7926
7927 static rtx
7928 construct_plt_address (rtx symbol)
7929 {
7930 rtx tmp, unspec;
7931
7932 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7933 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7934 gcc_assert (Pmode == DImode);
7935
7936 tmp = gen_reg_rtx (Pmode);
7937 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7938
7939 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7940 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7941 return tmp;
7942 }
7943
7944 /* Additional registers that are clobbered by SYSV calls but preserved by the 64-bit MS ABI. */
7945
7946 static int const x86_64_ms_sysv_extra_clobbered_registers
7947 [NUM_X86_64_MS_CLOBBERED_REGS] =
7948 {
7949 SI_REG, DI_REG,
7950 XMM6_REG, XMM7_REG,
7951 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7952 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7953 };
7954
7955 rtx_insn *
7956 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7957 rtx callarg2,
7958 rtx pop, bool sibcall)
7959 {
7960 rtx vec[3];
7961 rtx use = NULL, call;
7962 unsigned int vec_len = 0;
7963 tree fndecl;
7964
7965 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7966 {
7967 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7968 if (fndecl
7969 && (lookup_attribute ("interrupt",
7970 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7971 error ("interrupt service routine cannot be called directly");
7972 }
7973 else
7974 fndecl = NULL_TREE;
7975
7976 if (pop == const0_rtx)
7977 pop = NULL;
7978 gcc_assert (!TARGET_64BIT || !pop);
7979
7980 if (TARGET_MACHO && !TARGET_64BIT)
7981 {
7982 #if TARGET_MACHO
7983 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7984 fnaddr = machopic_indirect_call_target (fnaddr);
7985 #endif
7986 }
7987 else
7988 {
7989 /* Static functions and indirect calls don't need the pic register. Also,
7990 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7991 it an indirect call. */
7992 rtx addr = XEXP (fnaddr, 0);
7993 if (flag_pic
7994 && GET_CODE (addr) == SYMBOL_REF
7995 && !SYMBOL_REF_LOCAL_P (addr))
7996 {
7997 if (flag_plt
7998 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7999 || !lookup_attribute ("noplt",
8000 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
8001 {
8002 if (!TARGET_64BIT
8003 || (ix86_cmodel == CM_LARGE_PIC
8004 && DEFAULT_ABI != MS_ABI))
8005 {
8006 use_reg (&use, gen_rtx_REG (Pmode,
8007 REAL_PIC_OFFSET_TABLE_REGNUM));
8008 if (ix86_use_pseudo_pic_reg ())
8009 emit_move_insn (gen_rtx_REG (Pmode,
8010 REAL_PIC_OFFSET_TABLE_REGNUM),
8011 pic_offset_table_rtx);
8012 }
8013 }
8014 else if (!TARGET_PECOFF && !TARGET_MACHO)
8015 {
8016 if (TARGET_64BIT)
8017 {
8018 fnaddr = gen_rtx_UNSPEC (Pmode,
8019 gen_rtvec (1, addr),
8020 UNSPEC_GOTPCREL);
8021 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8022 }
8023 else
8024 {
8025 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8026 UNSPEC_GOT);
8027 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8028 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8029 fnaddr);
8030 }
8031 fnaddr = gen_const_mem (Pmode, fnaddr);
8032 /* Pmode may not be the same as word_mode for x32, which
8033 doesn't support indirect branch via 32-bit memory slot.
8034 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8035 indirect branch via x32 GOT slot is OK. */
8036 if (GET_MODE (fnaddr) != word_mode)
8037 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8038 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8039 }
8040 }
8041 }
8042
8043 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8044 parameters passed in vector registers. */
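/* (In the 64-bit SysV calling convention, a varargs callee expects %al to
   hold an upper bound on the number of vector registers actually used to
   pass arguments; CALLARG2 carries that count here.)  */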
8045 if (TARGET_64BIT
8046 && (INTVAL (callarg2) > 0
8047 || (INTVAL (callarg2) == 0
8048 && (TARGET_SSE || !flag_skip_rax_setup))))
8049 {
8050 rtx al = gen_rtx_REG (QImode, AX_REG);
8051 emit_move_insn (al, callarg2);
8052 use_reg (&use, al);
8053 }
8054
8055 if (ix86_cmodel == CM_LARGE_PIC
8056 && !TARGET_PECOFF
8057 && MEM_P (fnaddr)
8058 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8059 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8060 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8061 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8062 branch via x32 GOT slot is OK. */
8063 else if (!(TARGET_X32
8064 && MEM_P (fnaddr)
8065 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8066 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8067 && (sibcall
8068 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8069 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8070 {
8071 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8072 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8073 }
8074
8075 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8076
8077 if (retval)
8078 call = gen_rtx_SET (retval, call);
8079 vec[vec_len++] = call;
8080
8081 if (pop)
8082 {
8083 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8084 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8085 vec[vec_len++] = pop;
8086 }
8087
8088 if (cfun->machine->no_caller_saved_registers
8089 && (!fndecl
8090 || (!TREE_THIS_VOLATILE (fndecl)
8091 && !lookup_attribute ("no_caller_saved_registers",
8092 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8093 {
8094 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8095 bool is_64bit_ms_abi = (TARGET_64BIT
8096 && ix86_function_abi (fndecl) == MS_ABI);
8097 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8098
8099 /* If the current function preserves all registers (no caller-saved
8100 registers), add as clobbers all registers that a returning call clobbers. */
8101 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8102 if (!fixed_regs[i]
8103 && (ix86_call_used_regs[i] == 1
8104 || (ix86_call_used_regs[i] & c_mask))
8105 && !STACK_REGNO_P (i)
8106 && !MMX_REGNO_P (i))
8107 clobber_reg (&use,
8108 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8109 }
8110 else if (TARGET_64BIT_MS_ABI
8111 && (!callarg2 || INTVAL (callarg2) != -2))
8112 {
8113 unsigned i;
8114
8115 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8116 {
8117 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8118 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8119
8120 clobber_reg (&use, gen_rtx_REG (mode, regno));
8121 }
8122
8123 /* Set here, but it may get cleared later. */
8124 if (TARGET_CALL_MS2SYSV_XLOGUES)
8125 {
8126 if (!TARGET_SSE)
8127 ;
8128
8129 /* Don't break hot-patched functions. */
8130 else if (ix86_function_ms_hook_prologue (current_function_decl))
8131 ;
8132
8133 /* TODO: Cases not yet examined. */
8134 else if (flag_split_stack)
8135 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8136
8137 else
8138 {
8139 gcc_assert (!reload_completed);
8140 cfun->machine->call_ms2sysv = true;
8141 }
8142 }
8143 }
8144
8145 if (vec_len > 1)
8146 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8147 rtx_insn *call_insn = emit_call_insn (call);
8148 if (use)
8149 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8150
8151 return call_insn;
8152 }
8153
8154 /* Split a simple return that pops POPC bytes from the stack into an
8155 indirect branch with an explicit stack adjustment. */
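/* Roughly, the emitted sequence is:
     popl  %ecx          (return address into %ecx)
     addl  $POPC, %esp   (drop the callee-popped argument bytes)
     jmp   *%ecx
   with CFA notes attached so the unwinder can still find the return
   address.  */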
8156
8157 void
8158 ix86_split_simple_return_pop_internal (rtx popc)
8159 {
8160 struct machine_function *m = cfun->machine;
8161 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8162 rtx_insn *insn;
8163
8164 /* There is no "pascal" calling convention in any 64bit ABI. */
8165 gcc_assert (!TARGET_64BIT);
8166
8167 insn = emit_insn (gen_pop (ecx));
8168 m->fs.cfa_offset -= UNITS_PER_WORD;
8169 m->fs.sp_offset -= UNITS_PER_WORD;
8170
8171 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8172 x = gen_rtx_SET (stack_pointer_rtx, x);
8173 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8174 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8175 RTX_FRAME_RELATED_P (insn) = 1;
8176
8177 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8178 x = gen_rtx_SET (stack_pointer_rtx, x);
8179 insn = emit_insn (x);
8180 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8181 RTX_FRAME_RELATED_P (insn) = 1;
8182
8183 /* Now return address is in ECX. */
8184 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8185 }
8186
8187 /* Errors in the source file can cause expand_expr to return const0_rtx
8188 where we expect a vector. To avoid crashing, use one of the vector
8189 clear instructions. */
8190
8191 static rtx
8192 safe_vector_operand (rtx x, machine_mode mode)
8193 {
8194 if (x == const0_rtx)
8195 x = CONST0_RTX (mode);
8196 return x;
8197 }
8198
8199 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8200
8201 static rtx
8202 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8203 {
8204 rtx pat;
8205 tree arg0 = CALL_EXPR_ARG (exp, 0);
8206 tree arg1 = CALL_EXPR_ARG (exp, 1);
8207 rtx op0 = expand_normal (arg0);
8208 rtx op1 = expand_normal (arg1);
8209 machine_mode tmode = insn_data[icode].operand[0].mode;
8210 machine_mode mode0 = insn_data[icode].operand[1].mode;
8211 machine_mode mode1 = insn_data[icode].operand[2].mode;
8212
8213 if (VECTOR_MODE_P (mode0))
8214 op0 = safe_vector_operand (op0, mode0);
8215 if (VECTOR_MODE_P (mode1))
8216 op1 = safe_vector_operand (op1, mode1);
8217
8218 if (optimize || !target
8219 || GET_MODE (target) != tmode
8220 || !insn_data[icode].operand[0].predicate (target, tmode))
8221 target = gen_reg_rtx (tmode);
8222
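/* If the insn expects a TImode operand but the argument is a plain SImode
   value, load it into the low element of a V4SI register and reinterpret
   that register as TImode.  */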
8223 if (GET_MODE (op1) == SImode && mode1 == TImode)
8224 {
8225 rtx x = gen_reg_rtx (V4SImode);
8226 emit_insn (gen_sse2_loadd (x, op1));
8227 op1 = gen_lowpart (TImode, x);
8228 }
8229
8230 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8231 op0 = copy_to_mode_reg (mode0, op0);
8232 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8233 op1 = copy_to_mode_reg (mode1, op1);
8234
8235 pat = GEN_FCN (icode) (target, op0, op1);
8236 if (! pat)
8237 return 0;
8238
8239 emit_insn (pat);
8240
8241 return target;
8242 }
8243
8244 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8245
8246 static rtx
8247 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8248 enum ix86_builtin_func_type m_type,
8249 enum rtx_code sub_code)
8250 {
8251 rtx pat;
8252 int i;
8253 int nargs;
8254 bool comparison_p = false;
8255 bool tf_p = false;
8256 bool last_arg_constant = false;
8257 int num_memory = 0;
8258 struct {
8259 rtx op;
8260 machine_mode mode;
8261 } args[4];
8262
8263 machine_mode tmode = insn_data[icode].operand[0].mode;
8264
8265 switch (m_type)
8266 {
8267 case MULTI_ARG_4_DF2_DI_I:
8268 case MULTI_ARG_4_DF2_DI_I1:
8269 case MULTI_ARG_4_SF2_SI_I:
8270 case MULTI_ARG_4_SF2_SI_I1:
8271 nargs = 4;
8272 last_arg_constant = true;
8273 break;
8274
8275 case MULTI_ARG_3_SF:
8276 case MULTI_ARG_3_DF:
8277 case MULTI_ARG_3_SF2:
8278 case MULTI_ARG_3_DF2:
8279 case MULTI_ARG_3_DI:
8280 case MULTI_ARG_3_SI:
8281 case MULTI_ARG_3_SI_DI:
8282 case MULTI_ARG_3_HI:
8283 case MULTI_ARG_3_HI_SI:
8284 case MULTI_ARG_3_QI:
8285 case MULTI_ARG_3_DI2:
8286 case MULTI_ARG_3_SI2:
8287 case MULTI_ARG_3_HI2:
8288 case MULTI_ARG_3_QI2:
8289 nargs = 3;
8290 break;
8291
8292 case MULTI_ARG_2_SF:
8293 case MULTI_ARG_2_DF:
8294 case MULTI_ARG_2_DI:
8295 case MULTI_ARG_2_SI:
8296 case MULTI_ARG_2_HI:
8297 case MULTI_ARG_2_QI:
8298 nargs = 2;
8299 break;
8300
8301 case MULTI_ARG_2_DI_IMM:
8302 case MULTI_ARG_2_SI_IMM:
8303 case MULTI_ARG_2_HI_IMM:
8304 case MULTI_ARG_2_QI_IMM:
8305 nargs = 2;
8306 last_arg_constant = true;
8307 break;
8308
8309 case MULTI_ARG_1_SF:
8310 case MULTI_ARG_1_DF:
8311 case MULTI_ARG_1_SF2:
8312 case MULTI_ARG_1_DF2:
8313 case MULTI_ARG_1_DI:
8314 case MULTI_ARG_1_SI:
8315 case MULTI_ARG_1_HI:
8316 case MULTI_ARG_1_QI:
8317 case MULTI_ARG_1_SI_DI:
8318 case MULTI_ARG_1_HI_DI:
8319 case MULTI_ARG_1_HI_SI:
8320 case MULTI_ARG_1_QI_DI:
8321 case MULTI_ARG_1_QI_SI:
8322 case MULTI_ARG_1_QI_HI:
8323 nargs = 1;
8324 break;
8325
8326 case MULTI_ARG_2_DI_CMP:
8327 case MULTI_ARG_2_SI_CMP:
8328 case MULTI_ARG_2_HI_CMP:
8329 case MULTI_ARG_2_QI_CMP:
8330 nargs = 2;
8331 comparison_p = true;
8332 break;
8333
8334 case MULTI_ARG_2_SF_TF:
8335 case MULTI_ARG_2_DF_TF:
8336 case MULTI_ARG_2_DI_TF:
8337 case MULTI_ARG_2_SI_TF:
8338 case MULTI_ARG_2_HI_TF:
8339 case MULTI_ARG_2_QI_TF:
8340 nargs = 2;
8341 tf_p = true;
8342 break;
8343
8344 default:
8345 gcc_unreachable ();
8346 }
8347
8348 if (optimize || !target
8349 || GET_MODE (target) != tmode
8350 || !insn_data[icode].operand[0].predicate (target, tmode))
8351 target = gen_reg_rtx (tmode);
8352 else if (memory_operand (target, tmode))
8353 num_memory++;
8354
8355 gcc_assert (nargs <= 4);
8356
8357 for (i = 0; i < nargs; i++)
8358 {
8359 tree arg = CALL_EXPR_ARG (exp, i);
8360 rtx op = expand_normal (arg);
8361 int adjust = (comparison_p) ? 1 : 0;
8362 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8363
8364 if (last_arg_constant && i == nargs - 1)
8365 {
8366 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8367 {
8368 enum insn_code new_icode = icode;
8369 switch (icode)
8370 {
8371 case CODE_FOR_xop_vpermil2v2df3:
8372 case CODE_FOR_xop_vpermil2v4sf3:
8373 case CODE_FOR_xop_vpermil2v4df3:
8374 case CODE_FOR_xop_vpermil2v8sf3:
8375 error ("the last argument must be a 2-bit immediate");
8376 return gen_reg_rtx (tmode);
8377 case CODE_FOR_xop_rotlv2di3:
8378 new_icode = CODE_FOR_rotlv2di3;
8379 goto xop_rotl;
8380 case CODE_FOR_xop_rotlv4si3:
8381 new_icode = CODE_FOR_rotlv4si3;
8382 goto xop_rotl;
8383 case CODE_FOR_xop_rotlv8hi3:
8384 new_icode = CODE_FOR_rotlv8hi3;
8385 goto xop_rotl;
8386 case CODE_FOR_xop_rotlv16qi3:
8387 new_icode = CODE_FOR_rotlv16qi3;
8388 xop_rotl:
8389 if (CONST_INT_P (op))
8390 {
8391 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8392 op = GEN_INT (INTVAL (op) & mask);
8393 gcc_checking_assert
8394 (insn_data[icode].operand[i + 1].predicate (op, mode));
8395 }
8396 else
8397 {
8398 gcc_checking_assert
8399 (nargs == 2
8400 && insn_data[new_icode].operand[0].mode == tmode
8401 && insn_data[new_icode].operand[1].mode == tmode
8402 && insn_data[new_icode].operand[2].mode == mode
8403 && insn_data[new_icode].operand[0].predicate
8404 == insn_data[icode].operand[0].predicate
8405 && insn_data[new_icode].operand[1].predicate
8406 == insn_data[icode].operand[1].predicate);
8407 icode = new_icode;
8408 goto non_constant;
8409 }
8410 break;
8411 default:
8412 gcc_unreachable ();
8413 }
8414 }
8415 }
8416 else
8417 {
8418 non_constant:
8419 if (VECTOR_MODE_P (mode))
8420 op = safe_vector_operand (op, mode);
8421
8422 /* If we aren't optimizing, only allow one memory operand to be
8423 generated. */
8424 if (memory_operand (op, mode))
8425 num_memory++;
8426
8427 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8428
8429 if (optimize
8430 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8431 || num_memory > 1)
8432 op = force_reg (mode, op);
8433 }
8434
8435 args[i].op = op;
8436 args[i].mode = mode;
8437 }
8438
8439 switch (nargs)
8440 {
8441 case 1:
8442 pat = GEN_FCN (icode) (target, args[0].op);
8443 break;
8444
8445 case 2:
8446 if (tf_p)
8447 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8448 GEN_INT ((int)sub_code));
8449 else if (! comparison_p)
8450 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8451 else
8452 {
8453 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8454 args[0].op,
8455 args[1].op);
8456
8457 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8458 }
8459 break;
8460
8461 case 3:
8462 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8463 break;
8464
8465 case 4:
8466 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8467 break;
8468
8469 default:
8470 gcc_unreachable ();
8471 }
8472
8473 if (! pat)
8474 return 0;
8475
8476 emit_insn (pat);
8477 return target;
8478 }
8479
8480 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8481 insns with vec_merge. */
8482
8483 static rtx
8484 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8485 rtx target)
8486 {
8487 rtx pat;
8488 tree arg0 = CALL_EXPR_ARG (exp, 0);
8489 rtx op1, op0 = expand_normal (arg0);
8490 machine_mode tmode = insn_data[icode].operand[0].mode;
8491 machine_mode mode0 = insn_data[icode].operand[1].mode;
8492
8493 if (optimize || !target
8494 || GET_MODE (target) != tmode
8495 || !insn_data[icode].operand[0].predicate (target, tmode))
8496 target = gen_reg_rtx (tmode);
8497
8498 if (VECTOR_MODE_P (mode0))
8499 op0 = safe_vector_operand (op0, mode0);
8500
8501 if ((optimize && !register_operand (op0, mode0))
8502 || !insn_data[icode].operand[1].predicate (op0, mode0))
8503 op0 = copy_to_mode_reg (mode0, op0);
8504
8505 op1 = op0;
8506 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8507 op1 = copy_to_mode_reg (mode0, op1);
8508
8509 pat = GEN_FCN (icode) (target, op0, op1);
8510 if (! pat)
8511 return 0;
8512 emit_insn (pat);
8513 return target;
8514 }
8515
8516 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8517
8518 static rtx
8519 ix86_expand_sse_compare (const struct builtin_description *d,
8520 tree exp, rtx target, bool swap)
8521 {
8522 rtx pat;
8523 tree arg0 = CALL_EXPR_ARG (exp, 0);
8524 tree arg1 = CALL_EXPR_ARG (exp, 1);
8525 rtx op0 = expand_normal (arg0);
8526 rtx op1 = expand_normal (arg1);
8527 rtx op2;
8528 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8529 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8530 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8531 enum rtx_code comparison = d->comparison;
8532
8533 if (VECTOR_MODE_P (mode0))
8534 op0 = safe_vector_operand (op0, mode0);
8535 if (VECTOR_MODE_P (mode1))
8536 op1 = safe_vector_operand (op1, mode1);
8537
8538 /* Swap operands if we have a comparison that isn't available in
8539 hardware. */
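/* (For example, a greater-than comparison can be emitted as a less-than
   comparison with the operands exchanged.)  */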
8540 if (swap)
8541 std::swap (op0, op1);
8542
8543 if (optimize || !target
8544 || GET_MODE (target) != tmode
8545 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8546 target = gen_reg_rtx (tmode);
8547
8548 if ((optimize && !register_operand (op0, mode0))
8549 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8550 op0 = copy_to_mode_reg (mode0, op0);
8551 if ((optimize && !register_operand (op1, mode1))
8552 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8553 op1 = copy_to_mode_reg (mode1, op1);
8554
8555 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8556 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8557 if (! pat)
8558 return 0;
8559 emit_insn (pat);
8560 return target;
8561 }
8562
8563 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8564
8565 static rtx
8566 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8567 rtx target)
8568 {
8569 rtx pat;
8570 tree arg0 = CALL_EXPR_ARG (exp, 0);
8571 tree arg1 = CALL_EXPR_ARG (exp, 1);
8572 rtx op0 = expand_normal (arg0);
8573 rtx op1 = expand_normal (arg1);
8574 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8575 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8576 enum rtx_code comparison = d->comparison;
8577
8578 if (VECTOR_MODE_P (mode0))
8579 op0 = safe_vector_operand (op0, mode0);
8580 if (VECTOR_MODE_P (mode1))
8581 op1 = safe_vector_operand (op1, mode1);
8582
8583 /* Swap operands if we have a comparison that isn't available in
8584 hardware. */
8585 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8586 std::swap (op0, op1);
8587
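/* Build the result in the low byte of a zeroed SImode register; clearing
   the full register first ensures that the STRICT_LOW_PART store of the
   flags test below leaves the upper bits zero.  */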
8588 target = gen_reg_rtx (SImode);
8589 emit_move_insn (target, const0_rtx);
8590 target = gen_rtx_SUBREG (QImode, target, 0);
8591
8592 if ((optimize && !register_operand (op0, mode0))
8593 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8594 op0 = copy_to_mode_reg (mode0, op0);
8595 if ((optimize && !register_operand (op1, mode1))
8596 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8597 op1 = copy_to_mode_reg (mode1, op1);
8598
8599 pat = GEN_FCN (d->icode) (op0, op1);
8600 if (! pat)
8601 return 0;
8602 emit_insn (pat);
8603 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8604 gen_rtx_fmt_ee (comparison, QImode,
8605 SET_DEST (pat),
8606 const0_rtx)));
8607
8608 return SUBREG_REG (target);
8609 }
8610
8611 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8612
8613 static rtx
8614 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8615 rtx target)
8616 {
8617 rtx pat;
8618 tree arg0 = CALL_EXPR_ARG (exp, 0);
8619 rtx op1, op0 = expand_normal (arg0);
8620 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8621 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8622
8623 if (optimize || target == 0
8624 || GET_MODE (target) != tmode
8625 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8626 target = gen_reg_rtx (tmode);
8627
8628 if (VECTOR_MODE_P (mode0))
8629 op0 = safe_vector_operand (op0, mode0);
8630
8631 if ((optimize && !register_operand (op0, mode0))
8632 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8633 op0 = copy_to_mode_reg (mode0, op0);
8634
8635 op1 = GEN_INT (d->comparison);
8636
8637 pat = GEN_FCN (d->icode) (target, op0, op1);
8638 if (! pat)
8639 return 0;
8640 emit_insn (pat);
8641 return target;
8642 }
8643
8644 static rtx
8645 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8646 tree exp, rtx target)
8647 {
8648 rtx pat;
8649 tree arg0 = CALL_EXPR_ARG (exp, 0);
8650 tree arg1 = CALL_EXPR_ARG (exp, 1);
8651 rtx op0 = expand_normal (arg0);
8652 rtx op1 = expand_normal (arg1);
8653 rtx op2;
8654 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8655 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8656 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8657
8658 if (optimize || target == 0
8659 || GET_MODE (target) != tmode
8660 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8661 target = gen_reg_rtx (tmode);
8662
8663 op0 = safe_vector_operand (op0, mode0);
8664 op1 = safe_vector_operand (op1, mode1);
8665
8666 if ((optimize && !register_operand (op0, mode0))
8667 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8668 op0 = copy_to_mode_reg (mode0, op0);
8669 if ((optimize && !register_operand (op1, mode1))
8670 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8671 op1 = copy_to_mode_reg (mode1, op1);
8672
8673 op2 = GEN_INT (d->comparison);
8674
8675 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8676 if (! pat)
8677 return 0;
8678 emit_insn (pat);
8679 return target;
8680 }
8681
8682 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8683
8684 static rtx
8685 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8686 rtx target)
8687 {
8688 rtx pat;
8689 tree arg0 = CALL_EXPR_ARG (exp, 0);
8690 tree arg1 = CALL_EXPR_ARG (exp, 1);
8691 rtx op0 = expand_normal (arg0);
8692 rtx op1 = expand_normal (arg1);
8693 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8694 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8695 enum rtx_code comparison = d->comparison;
8696
8697 if (VECTOR_MODE_P (mode0))
8698 op0 = safe_vector_operand (op0, mode0);
8699 if (VECTOR_MODE_P (mode1))
8700 op1 = safe_vector_operand (op1, mode1);
8701
8702 target = gen_reg_rtx (SImode);
8703 emit_move_insn (target, const0_rtx);
8704 target = gen_rtx_SUBREG (QImode, target, 0);
8705
8706 if ((optimize && !register_operand (op0, mode0))
8707 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8708 op0 = copy_to_mode_reg (mode0, op0);
8709 if ((optimize && !register_operand (op1, mode1))
8710 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8711 op1 = copy_to_mode_reg (mode1, op1);
8712
8713 pat = GEN_FCN (d->icode) (op0, op1);
8714 if (! pat)
8715 return 0;
8716 emit_insn (pat);
8717 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8718 gen_rtx_fmt_ee (comparison, QImode,
8719 SET_DEST (pat),
8720 const0_rtx)));
8721
8722 return SUBREG_REG (target);
8723 }
8724
8725 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8726
8727 static rtx
8728 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8729 tree exp, rtx target)
8730 {
8731 rtx pat;
8732 tree arg0 = CALL_EXPR_ARG (exp, 0);
8733 tree arg1 = CALL_EXPR_ARG (exp, 1);
8734 tree arg2 = CALL_EXPR_ARG (exp, 2);
8735 tree arg3 = CALL_EXPR_ARG (exp, 3);
8736 tree arg4 = CALL_EXPR_ARG (exp, 4);
8737 rtx scratch0, scratch1;
8738 rtx op0 = expand_normal (arg0);
8739 rtx op1 = expand_normal (arg1);
8740 rtx op2 = expand_normal (arg2);
8741 rtx op3 = expand_normal (arg3);
8742 rtx op4 = expand_normal (arg4);
8743 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8744
8745 tmode0 = insn_data[d->icode].operand[0].mode;
8746 tmode1 = insn_data[d->icode].operand[1].mode;
8747 modev2 = insn_data[d->icode].operand[2].mode;
8748 modei3 = insn_data[d->icode].operand[3].mode;
8749 modev4 = insn_data[d->icode].operand[4].mode;
8750 modei5 = insn_data[d->icode].operand[5].mode;
8751 modeimm = insn_data[d->icode].operand[6].mode;
8752
8753 if (VECTOR_MODE_P (modev2))
8754 op0 = safe_vector_operand (op0, modev2);
8755 if (VECTOR_MODE_P (modev4))
8756 op2 = safe_vector_operand (op2, modev4);
8757
8758 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8759 op0 = copy_to_mode_reg (modev2, op0);
8760 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8761 op1 = copy_to_mode_reg (modei3, op1);
8762 if ((optimize && !register_operand (op2, modev4))
8763 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8764 op2 = copy_to_mode_reg (modev4, op2);
8765 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8766 op3 = copy_to_mode_reg (modei5, op3);
8767
8768 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8769 {
8770 error ("the fifth argument must be an 8-bit immediate");
8771 return const0_rtx;
8772 }
8773
8774 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8775 {
8776 if (optimize || !target
8777 || GET_MODE (target) != tmode0
8778 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8779 target = gen_reg_rtx (tmode0);
8780
8781 scratch1 = gen_reg_rtx (tmode1);
8782
8783 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8784 }
8785 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8786 {
8787 if (optimize || !target
8788 || GET_MODE (target) != tmode1
8789 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8790 target = gen_reg_rtx (tmode1);
8791
8792 scratch0 = gen_reg_rtx (tmode0);
8793
8794 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8795 }
8796 else
8797 {
8798 gcc_assert (d->flag);
8799
8800 scratch0 = gen_reg_rtx (tmode0);
8801 scratch1 = gen_reg_rtx (tmode1);
8802
8803 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8804 }
8805
8806 if (! pat)
8807 return 0;
8808
8809 emit_insn (pat);
8810
8811 if (d->flag)
8812 {
8813 target = gen_reg_rtx (SImode);
8814 emit_move_insn (target, const0_rtx);
8815 target = gen_rtx_SUBREG (QImode, target, 0);
8816
8817 emit_insn
8818 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8819 gen_rtx_fmt_ee (EQ, QImode,
8820 gen_rtx_REG ((machine_mode) d->flag,
8821 FLAGS_REG),
8822 const0_rtx)));
8823 return SUBREG_REG (target);
8824 }
8825 else
8826 return target;
8827 }
8828
8829
8830 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8831
8832 static rtx
8833 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8834 tree exp, rtx target)
8835 {
8836 rtx pat;
8837 tree arg0 = CALL_EXPR_ARG (exp, 0);
8838 tree arg1 = CALL_EXPR_ARG (exp, 1);
8839 tree arg2 = CALL_EXPR_ARG (exp, 2);
8840 rtx scratch0, scratch1;
8841 rtx op0 = expand_normal (arg0);
8842 rtx op1 = expand_normal (arg1);
8843 rtx op2 = expand_normal (arg2);
8844 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8845
8846 tmode0 = insn_data[d->icode].operand[0].mode;
8847 tmode1 = insn_data[d->icode].operand[1].mode;
8848 modev2 = insn_data[d->icode].operand[2].mode;
8849 modev3 = insn_data[d->icode].operand[3].mode;
8850 modeimm = insn_data[d->icode].operand[4].mode;
8851
8852 if (VECTOR_MODE_P (modev2))
8853 op0 = safe_vector_operand (op0, modev2);
8854 if (VECTOR_MODE_P (modev3))
8855 op1 = safe_vector_operand (op1, modev3);
8856
8857 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8858 op0 = copy_to_mode_reg (modev2, op0);
8859 if ((optimize && !register_operand (op1, modev3))
8860 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8861 op1 = copy_to_mode_reg (modev3, op1);
8862
8863 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8864 {
8865 error ("the third argument must be an 8-bit immediate");
8866 return const0_rtx;
8867 }
8868
8869 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8870 {
8871 if (optimize || !target
8872 || GET_MODE (target) != tmode0
8873 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8874 target = gen_reg_rtx (tmode0);
8875
8876 scratch1 = gen_reg_rtx (tmode1);
8877
8878 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8879 }
8880 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8881 {
8882 if (optimize || !target
8883 || GET_MODE (target) != tmode1
8884 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8885 target = gen_reg_rtx (tmode1);
8886
8887 scratch0 = gen_reg_rtx (tmode0);
8888
8889 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8890 }
8891 else
8892 {
8893 gcc_assert (d->flag);
8894
8895 scratch0 = gen_reg_rtx (tmode0);
8896 scratch1 = gen_reg_rtx (tmode1);
8897
8898 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8899 }
8900
8901 if (! pat)
8902 return 0;
8903
8904 emit_insn (pat);
8905
8906 if (d->flag)
8907 {
8908 target = gen_reg_rtx (SImode);
8909 emit_move_insn (target, const0_rtx);
8910 target = gen_rtx_SUBREG (QImode, target, 0);
8911
8912 emit_insn
8913 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8914 gen_rtx_fmt_ee (EQ, QImode,
8915 gen_rtx_REG ((machine_mode) d->flag,
8916 FLAGS_REG),
8917 const0_rtx)));
8918 return SUBREG_REG (target);
8919 }
8920 else
8921 return target;
8922 }
8923
8924 /* Fix up modeless constants to fit the required mode. */
8925
8926 static rtx
8927 fixup_modeless_constant (rtx x, machine_mode mode)
8928 {
8929 if (GET_MODE (x) == VOIDmode)
8930 x = convert_to_mode (mode, x, 1);
8931 return x;
8932 }
8933
8934 /* Subroutine of ix86_expand_builtin to take care of insns with a
8935 variable number of operands. */
8936
8937 static rtx
8938 ix86_expand_args_builtin (const struct builtin_description *d,
8939 tree exp, rtx target)
8940 {
8941 rtx pat, real_target;
8942 unsigned int i, nargs;
8943 unsigned int nargs_constant = 0;
8944 unsigned int mask_pos = 0;
8945 int num_memory = 0;
8946 struct
8947 {
8948 rtx op;
8949 machine_mode mode;
8950 } args[6];
8951 bool second_arg_count = false;
8952 enum insn_code icode = d->icode;
8953 const struct insn_data_d *insn_p = &insn_data[icode];
8954 machine_mode tmode = insn_p->operand[0].mode;
8955 machine_mode rmode = VOIDmode;
8956 bool swap = false;
8957 enum rtx_code comparison = d->comparison;
8958
8959 switch ((enum ix86_builtin_func_type) d->flag)
8960 {
8961 case V2DF_FTYPE_V2DF_ROUND:
8962 case V4DF_FTYPE_V4DF_ROUND:
8963 case V8DF_FTYPE_V8DF_ROUND:
8964 case V4SF_FTYPE_V4SF_ROUND:
8965 case V8SF_FTYPE_V8SF_ROUND:
8966 case V16SF_FTYPE_V16SF_ROUND:
8967 case V4SI_FTYPE_V4SF_ROUND:
8968 case V8SI_FTYPE_V8SF_ROUND:
8969 case V16SI_FTYPE_V16SF_ROUND:
8970 return ix86_expand_sse_round (d, exp, target);
8971 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8972 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8973 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8974 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8975 case INT_FTYPE_V8SF_V8SF_PTEST:
8976 case INT_FTYPE_V4DI_V4DI_PTEST:
8977 case INT_FTYPE_V4DF_V4DF_PTEST:
8978 case INT_FTYPE_V4SF_V4SF_PTEST:
8979 case INT_FTYPE_V2DI_V2DI_PTEST:
8980 case INT_FTYPE_V2DF_V2DF_PTEST:
8981 return ix86_expand_sse_ptest (d, exp, target);
8982 case FLOAT128_FTYPE_FLOAT128:
8983 case FLOAT_FTYPE_FLOAT:
8984 case INT_FTYPE_INT:
8985 case UINT_FTYPE_UINT:
8986 case UINT16_FTYPE_UINT16:
8987 case UINT64_FTYPE_INT:
8988 case UINT64_FTYPE_UINT64:
8989 case INT64_FTYPE_INT64:
8990 case INT64_FTYPE_V4SF:
8991 case INT64_FTYPE_V2DF:
8992 case INT_FTYPE_V16QI:
8993 case INT_FTYPE_V8QI:
8994 case INT_FTYPE_V8SF:
8995 case INT_FTYPE_V4DF:
8996 case INT_FTYPE_V4SF:
8997 case INT_FTYPE_V2DF:
8998 case INT_FTYPE_V32QI:
8999 case V16QI_FTYPE_V16QI:
9000 case V8SI_FTYPE_V8SF:
9001 case V8SI_FTYPE_V4SI:
9002 case V8HI_FTYPE_V8HI:
9003 case V8HI_FTYPE_V16QI:
9004 case V8QI_FTYPE_V8QI:
9005 case V8SF_FTYPE_V8SF:
9006 case V8SF_FTYPE_V8SI:
9007 case V8SF_FTYPE_V4SF:
9008 case V8SF_FTYPE_V8HI:
9009 case V4SI_FTYPE_V4SI:
9010 case V4SI_FTYPE_V16QI:
9011 case V4SI_FTYPE_V4SF:
9012 case V4SI_FTYPE_V8SI:
9013 case V4SI_FTYPE_V8HI:
9014 case V4SI_FTYPE_V4DF:
9015 case V4SI_FTYPE_V2DF:
9016 case V4HI_FTYPE_V4HI:
9017 case V4DF_FTYPE_V4DF:
9018 case V4DF_FTYPE_V4SI:
9019 case V4DF_FTYPE_V4SF:
9020 case V4DF_FTYPE_V2DF:
9021 case V4SF_FTYPE_V4SF:
9022 case V4SF_FTYPE_V4SI:
9023 case V4SF_FTYPE_V8SF:
9024 case V4SF_FTYPE_V4DF:
9025 case V4SF_FTYPE_V8HI:
9026 case V4SF_FTYPE_V2DF:
9027 case V2DI_FTYPE_V2DI:
9028 case V2DI_FTYPE_V16QI:
9029 case V2DI_FTYPE_V8HI:
9030 case V2DI_FTYPE_V4SI:
9031 case V2DF_FTYPE_V2DF:
9032 case V2DF_FTYPE_V4SI:
9033 case V2DF_FTYPE_V4DF:
9034 case V2DF_FTYPE_V4SF:
9035 case V2DF_FTYPE_V2SI:
9036 case V2SI_FTYPE_V2SI:
9037 case V2SI_FTYPE_V4SF:
9038 case V2SI_FTYPE_V2SF:
9039 case V2SI_FTYPE_V2DF:
9040 case V2SF_FTYPE_V2SF:
9041 case V2SF_FTYPE_V2SI:
9042 case V32QI_FTYPE_V32QI:
9043 case V32QI_FTYPE_V16QI:
9044 case V16HI_FTYPE_V16HI:
9045 case V16HI_FTYPE_V8HI:
9046 case V8SI_FTYPE_V8SI:
9047 case V16HI_FTYPE_V16QI:
9048 case V8SI_FTYPE_V16QI:
9049 case V4DI_FTYPE_V16QI:
9050 case V8SI_FTYPE_V8HI:
9051 case V4DI_FTYPE_V8HI:
9052 case V4DI_FTYPE_V4SI:
9053 case V4DI_FTYPE_V2DI:
9054 case UQI_FTYPE_UQI:
9055 case UHI_FTYPE_UHI:
9056 case USI_FTYPE_USI:
9057 case USI_FTYPE_UQI:
9058 case USI_FTYPE_UHI:
9059 case UDI_FTYPE_UDI:
9060 case UHI_FTYPE_V16QI:
9061 case USI_FTYPE_V32QI:
9062 case UDI_FTYPE_V64QI:
9063 case V16QI_FTYPE_UHI:
9064 case V32QI_FTYPE_USI:
9065 case V64QI_FTYPE_UDI:
9066 case V8HI_FTYPE_UQI:
9067 case V16HI_FTYPE_UHI:
9068 case V32HI_FTYPE_USI:
9069 case V4SI_FTYPE_UQI:
9070 case V8SI_FTYPE_UQI:
9071 case V4SI_FTYPE_UHI:
9072 case V8SI_FTYPE_UHI:
9073 case UQI_FTYPE_V8HI:
9074 case UHI_FTYPE_V16HI:
9075 case USI_FTYPE_V32HI:
9076 case UQI_FTYPE_V4SI:
9077 case UQI_FTYPE_V8SI:
9078 case UHI_FTYPE_V16SI:
9079 case UQI_FTYPE_V2DI:
9080 case UQI_FTYPE_V4DI:
9081 case UQI_FTYPE_V8DI:
9082 case V16SI_FTYPE_UHI:
9083 case V2DI_FTYPE_UQI:
9084 case V4DI_FTYPE_UQI:
9085 case V16SI_FTYPE_INT:
9086 case V16SF_FTYPE_V8SF:
9087 case V16SI_FTYPE_V8SI:
9088 case V16SF_FTYPE_V4SF:
9089 case V16SI_FTYPE_V4SI:
9090 case V16SI_FTYPE_V16SF:
9091 case V16SI_FTYPE_V16SI:
9092 case V64QI_FTYPE_V64QI:
9093 case V32HI_FTYPE_V32HI:
9094 case V16SF_FTYPE_V16SF:
9095 case V8DI_FTYPE_UQI:
9096 case V8DI_FTYPE_V8DI:
9097 case V8DF_FTYPE_V4DF:
9098 case V8DF_FTYPE_V2DF:
9099 case V8DF_FTYPE_V8DF:
9100 case V4DI_FTYPE_V4DI:
9101 case V16HI_FTYPE_V16SF:
9102 case V8HI_FTYPE_V8SF:
9103 case V8HI_FTYPE_V4SF:
9104 nargs = 1;
9105 break;
9106 case V4SF_FTYPE_V4SF_VEC_MERGE:
9107 case V2DF_FTYPE_V2DF_VEC_MERGE:
9108 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9109 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9110 case V16QI_FTYPE_V16QI_V16QI:
9111 case V16QI_FTYPE_V8HI_V8HI:
9112 case V16SF_FTYPE_V16SF_V16SF:
9113 case V8QI_FTYPE_V8QI_V8QI:
9114 case V8QI_FTYPE_V4HI_V4HI:
9115 case V8HI_FTYPE_V8HI_V8HI:
9116 case V8HI_FTYPE_V16QI_V16QI:
9117 case V8HI_FTYPE_V4SI_V4SI:
9118 case V8SF_FTYPE_V8SF_V8SF:
9119 case V8SF_FTYPE_V8SF_V8SI:
9120 case V8DF_FTYPE_V8DF_V8DF:
9121 case V4SI_FTYPE_V4SI_V4SI:
9122 case V4SI_FTYPE_V8HI_V8HI:
9123 case V4SI_FTYPE_V2DF_V2DF:
9124 case V4HI_FTYPE_V4HI_V4HI:
9125 case V4HI_FTYPE_V8QI_V8QI:
9126 case V4HI_FTYPE_V2SI_V2SI:
9127 case V4DF_FTYPE_V4DF_V4DF:
9128 case V4DF_FTYPE_V4DF_V4DI:
9129 case V4SF_FTYPE_V4SF_V4SF:
9130 case V4SF_FTYPE_V4SF_V4SI:
9131 case V4SF_FTYPE_V4SF_V2SI:
9132 case V4SF_FTYPE_V4SF_V2DF:
9133 case V4SF_FTYPE_V4SF_UINT:
9134 case V4SF_FTYPE_V4SF_DI:
9135 case V4SF_FTYPE_V4SF_SI:
9136 case V2DI_FTYPE_V2DI_V2DI:
9137 case V2DI_FTYPE_V16QI_V16QI:
9138 case V2DI_FTYPE_V4SI_V4SI:
9139 case V2DI_FTYPE_V2DI_V16QI:
9140 case V2SI_FTYPE_V2SI_V2SI:
9141 case V2SI_FTYPE_V4HI_V4HI:
9142 case V2SI_FTYPE_V2SF_V2SF:
9143 case V2DF_FTYPE_V2DF_V2DF:
9144 case V2DF_FTYPE_V2DF_V4SF:
9145 case V2DF_FTYPE_V2DF_V2DI:
9146 case V2DF_FTYPE_V2DF_DI:
9147 case V2DF_FTYPE_V2DF_SI:
9148 case V2DF_FTYPE_V2DF_UINT:
9149 case V2SF_FTYPE_V2SF_V2SF:
9150 case V1DI_FTYPE_V1DI_V1DI:
9151 case V1DI_FTYPE_V8QI_V8QI:
9152 case V1DI_FTYPE_V2SI_V2SI:
9153 case V32QI_FTYPE_V16HI_V16HI:
9154 case V16HI_FTYPE_V8SI_V8SI:
9155 case V64QI_FTYPE_V64QI_V64QI:
9156 case V32QI_FTYPE_V32QI_V32QI:
9157 case V16HI_FTYPE_V32QI_V32QI:
9158 case V16HI_FTYPE_V16HI_V16HI:
9159 case V8SI_FTYPE_V4DF_V4DF:
9160 case V8SI_FTYPE_V8SI_V8SI:
9161 case V8SI_FTYPE_V16HI_V16HI:
9162 case V4DI_FTYPE_V4DI_V4DI:
9163 case V4DI_FTYPE_V8SI_V8SI:
9164 case V8DI_FTYPE_V64QI_V64QI:
9165 if (comparison == UNKNOWN)
9166 return ix86_expand_binop_builtin (icode, exp, target);
9167 nargs = 2;
9168 break;
9169 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9170 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9171 gcc_assert (comparison != UNKNOWN);
9172 nargs = 2;
9173 swap = true;
9174 break;
9175 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9176 case V16HI_FTYPE_V16HI_SI_COUNT:
9177 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9178 case V8SI_FTYPE_V8SI_SI_COUNT:
9179 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9180 case V4DI_FTYPE_V4DI_INT_COUNT:
9181 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9182 case V8HI_FTYPE_V8HI_SI_COUNT:
9183 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9184 case V4SI_FTYPE_V4SI_SI_COUNT:
9185 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9186 case V4HI_FTYPE_V4HI_SI_COUNT:
9187 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9188 case V2DI_FTYPE_V2DI_SI_COUNT:
9189 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9190 case V2SI_FTYPE_V2SI_SI_COUNT:
9191 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9192 case V1DI_FTYPE_V1DI_SI_COUNT:
9193 nargs = 2;
9194 second_arg_count = true;
9195 break;
9196 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9197 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9198 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9199 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9200 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9201 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9202 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9203 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9204 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9205 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9206 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9207 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9208 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9209 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9210 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9211 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9212 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9213 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9214 nargs = 4;
9215 second_arg_count = true;
9216 break;
9217 case UINT64_FTYPE_UINT64_UINT64:
9218 case UINT_FTYPE_UINT_UINT:
9219 case UINT_FTYPE_UINT_USHORT:
9220 case UINT_FTYPE_UINT_UCHAR:
9221 case UINT16_FTYPE_UINT16_INT:
9222 case UINT8_FTYPE_UINT8_INT:
9223 case UQI_FTYPE_UQI_UQI:
9224 case UHI_FTYPE_UHI_UHI:
9225 case USI_FTYPE_USI_USI:
9226 case UDI_FTYPE_UDI_UDI:
9227 case V16SI_FTYPE_V8DF_V8DF:
9228 case V32HI_FTYPE_V16SF_V16SF:
9229 case V16HI_FTYPE_V8SF_V8SF:
9230 case V8HI_FTYPE_V4SF_V4SF:
9231 case V16HI_FTYPE_V16SF_UHI:
9232 case V8HI_FTYPE_V8SF_UQI:
9233 case V8HI_FTYPE_V4SF_UQI:
9234 nargs = 2;
9235 break;
9236 case V2DI_FTYPE_V2DI_INT_CONVERT:
9237 nargs = 2;
9238 rmode = V1TImode;
9239 nargs_constant = 1;
9240 break;
9241 case V4DI_FTYPE_V4DI_INT_CONVERT:
9242 nargs = 2;
9243 rmode = V2TImode;
9244 nargs_constant = 1;
9245 break;
9246 case V8DI_FTYPE_V8DI_INT_CONVERT:
9247 nargs = 2;
9248 rmode = V4TImode;
9249 nargs_constant = 1;
9250 break;
9251 case V8HI_FTYPE_V8HI_INT:
9252 case V8HI_FTYPE_V8SF_INT:
9253 case V16HI_FTYPE_V16SF_INT:
9254 case V8HI_FTYPE_V4SF_INT:
9255 case V8SF_FTYPE_V8SF_INT:
9256 case V4SF_FTYPE_V16SF_INT:
9257 case V16SF_FTYPE_V16SF_INT:
9258 case V4SI_FTYPE_V4SI_INT:
9259 case V4SI_FTYPE_V8SI_INT:
9260 case V4HI_FTYPE_V4HI_INT:
9261 case V4DF_FTYPE_V4DF_INT:
9262 case V4DF_FTYPE_V8DF_INT:
9263 case V4SF_FTYPE_V4SF_INT:
9264 case V4SF_FTYPE_V8SF_INT:
9265 case V2DI_FTYPE_V2DI_INT:
9266 case V2DF_FTYPE_V2DF_INT:
9267 case V2DF_FTYPE_V4DF_INT:
9268 case V16HI_FTYPE_V16HI_INT:
9269 case V8SI_FTYPE_V8SI_INT:
9270 case V16SI_FTYPE_V16SI_INT:
9271 case V4SI_FTYPE_V16SI_INT:
9272 case V4DI_FTYPE_V4DI_INT:
9273 case V2DI_FTYPE_V4DI_INT:
9274 case V4DI_FTYPE_V8DI_INT:
9275 case QI_FTYPE_V4SF_INT:
9276 case QI_FTYPE_V2DF_INT:
9277 case UQI_FTYPE_UQI_UQI_CONST:
9278 case UHI_FTYPE_UHI_UQI:
9279 case USI_FTYPE_USI_UQI:
9280 case UDI_FTYPE_UDI_UQI:
9281 nargs = 2;
9282 nargs_constant = 1;
9283 break;
9284 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9285 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9286 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9287 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9288 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9289 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9290 case UHI_FTYPE_V16SI_V16SI_UHI:
9291 case UQI_FTYPE_V8DI_V8DI_UQI:
9292 case V16HI_FTYPE_V16SI_V16HI_UHI:
9293 case V16QI_FTYPE_V16SI_V16QI_UHI:
9294 case V16QI_FTYPE_V8DI_V16QI_UQI:
9295 case V16SF_FTYPE_V16SF_V16SF_UHI:
9296 case V16SF_FTYPE_V4SF_V16SF_UHI:
9297 case V16SI_FTYPE_SI_V16SI_UHI:
9298 case V16SI_FTYPE_V16HI_V16SI_UHI:
9299 case V16SI_FTYPE_V16QI_V16SI_UHI:
9300 case V8SF_FTYPE_V4SF_V8SF_UQI:
9301 case V4DF_FTYPE_V2DF_V4DF_UQI:
9302 case V8SI_FTYPE_V4SI_V8SI_UQI:
9303 case V8SI_FTYPE_SI_V8SI_UQI:
9304 case V4SI_FTYPE_V4SI_V4SI_UQI:
9305 case V4SI_FTYPE_SI_V4SI_UQI:
9306 case V4DI_FTYPE_V2DI_V4DI_UQI:
9307 case V4DI_FTYPE_DI_V4DI_UQI:
9308 case V2DI_FTYPE_V2DI_V2DI_UQI:
9309 case V2DI_FTYPE_DI_V2DI_UQI:
9310 case V64QI_FTYPE_V64QI_V64QI_UDI:
9311 case V64QI_FTYPE_V16QI_V64QI_UDI:
9312 case V64QI_FTYPE_QI_V64QI_UDI:
9313 case V32QI_FTYPE_V32QI_V32QI_USI:
9314 case V32QI_FTYPE_V16QI_V32QI_USI:
9315 case V32QI_FTYPE_QI_V32QI_USI:
9316 case V16QI_FTYPE_V16QI_V16QI_UHI:
9317 case V16QI_FTYPE_QI_V16QI_UHI:
9318 case V32HI_FTYPE_V8HI_V32HI_USI:
9319 case V32HI_FTYPE_HI_V32HI_USI:
9320 case V16HI_FTYPE_V8HI_V16HI_UHI:
9321 case V16HI_FTYPE_HI_V16HI_UHI:
9322 case V8HI_FTYPE_V8HI_V8HI_UQI:
9323 case V8HI_FTYPE_HI_V8HI_UQI:
9324 case V8SF_FTYPE_V8HI_V8SF_UQI:
9325 case V4SF_FTYPE_V8HI_V4SF_UQI:
9326 case V8SI_FTYPE_V8SF_V8SI_UQI:
9327 case V4SI_FTYPE_V4SF_V4SI_UQI:
9328 case V4DI_FTYPE_V4SF_V4DI_UQI:
9329 case V2DI_FTYPE_V4SF_V2DI_UQI:
9330 case V4SF_FTYPE_V4DI_V4SF_UQI:
9331 case V4SF_FTYPE_V2DI_V4SF_UQI:
9332 case V4DF_FTYPE_V4DI_V4DF_UQI:
9333 case V2DF_FTYPE_V2DI_V2DF_UQI:
9334 case V16QI_FTYPE_V8HI_V16QI_UQI:
9335 case V16QI_FTYPE_V16HI_V16QI_UHI:
9336 case V16QI_FTYPE_V4SI_V16QI_UQI:
9337 case V16QI_FTYPE_V8SI_V16QI_UQI:
9338 case V8HI_FTYPE_V4SI_V8HI_UQI:
9339 case V8HI_FTYPE_V8SI_V8HI_UQI:
9340 case V16QI_FTYPE_V2DI_V16QI_UQI:
9341 case V16QI_FTYPE_V4DI_V16QI_UQI:
9342 case V8HI_FTYPE_V2DI_V8HI_UQI:
9343 case V8HI_FTYPE_V4DI_V8HI_UQI:
9344 case V4SI_FTYPE_V2DI_V4SI_UQI:
9345 case V4SI_FTYPE_V4DI_V4SI_UQI:
9346 case V32QI_FTYPE_V32HI_V32QI_USI:
9347 case UHI_FTYPE_V16QI_V16QI_UHI:
9348 case USI_FTYPE_V32QI_V32QI_USI:
9349 case UDI_FTYPE_V64QI_V64QI_UDI:
9350 case UQI_FTYPE_V8HI_V8HI_UQI:
9351 case UHI_FTYPE_V16HI_V16HI_UHI:
9352 case USI_FTYPE_V32HI_V32HI_USI:
9353 case UQI_FTYPE_V4SI_V4SI_UQI:
9354 case UQI_FTYPE_V8SI_V8SI_UQI:
9355 case UQI_FTYPE_V2DI_V2DI_UQI:
9356 case UQI_FTYPE_V4DI_V4DI_UQI:
9357 case V4SF_FTYPE_V2DF_V4SF_UQI:
9358 case V4SF_FTYPE_V4DF_V4SF_UQI:
9359 case V16SI_FTYPE_V16SI_V16SI_UHI:
9360 case V16SI_FTYPE_V4SI_V16SI_UHI:
9361 case V2DI_FTYPE_V4SI_V2DI_UQI:
9362 case V2DI_FTYPE_V8HI_V2DI_UQI:
9363 case V2DI_FTYPE_V16QI_V2DI_UQI:
9364 case V4DI_FTYPE_V4DI_V4DI_UQI:
9365 case V4DI_FTYPE_V4SI_V4DI_UQI:
9366 case V4DI_FTYPE_V8HI_V4DI_UQI:
9367 case V4DI_FTYPE_V16QI_V4DI_UQI:
9368 case V4DI_FTYPE_V4DF_V4DI_UQI:
9369 case V2DI_FTYPE_V2DF_V2DI_UQI:
9370 case V4SI_FTYPE_V4DF_V4SI_UQI:
9371 case V4SI_FTYPE_V2DF_V4SI_UQI:
9372 case V4SI_FTYPE_V8HI_V4SI_UQI:
9373 case V4SI_FTYPE_V16QI_V4SI_UQI:
9374 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9375 case V8DF_FTYPE_V2DF_V8DF_UQI:
9376 case V8DF_FTYPE_V4DF_V8DF_UQI:
9377 case V8DF_FTYPE_V8DF_V8DF_UQI:
9378 case V8SF_FTYPE_V8SF_V8SF_UQI:
9379 case V8SF_FTYPE_V8SI_V8SF_UQI:
9380 case V4DF_FTYPE_V4DF_V4DF_UQI:
9381 case V4SF_FTYPE_V4SF_V4SF_UQI:
9382 case V2DF_FTYPE_V2DF_V2DF_UQI:
9383 case V2DF_FTYPE_V4SF_V2DF_UQI:
9384 case V2DF_FTYPE_V4SI_V2DF_UQI:
9385 case V4SF_FTYPE_V4SI_V4SF_UQI:
9386 case V4DF_FTYPE_V4SF_V4DF_UQI:
9387 case V4DF_FTYPE_V4SI_V4DF_UQI:
9388 case V8SI_FTYPE_V8SI_V8SI_UQI:
9389 case V8SI_FTYPE_V8HI_V8SI_UQI:
9390 case V8SI_FTYPE_V16QI_V8SI_UQI:
9391 case V8DF_FTYPE_V8SI_V8DF_UQI:
9392 case V8DI_FTYPE_DI_V8DI_UQI:
9393 case V16SF_FTYPE_V8SF_V16SF_UHI:
9394 case V16SI_FTYPE_V8SI_V16SI_UHI:
9395 case V16HI_FTYPE_V16HI_V16HI_UHI:
9396 case V8HI_FTYPE_V16QI_V8HI_UQI:
9397 case V16HI_FTYPE_V16QI_V16HI_UHI:
9398 case V32HI_FTYPE_V32HI_V32HI_USI:
9399 case V32HI_FTYPE_V32QI_V32HI_USI:
9400 case V8DI_FTYPE_V16QI_V8DI_UQI:
9401 case V8DI_FTYPE_V2DI_V8DI_UQI:
9402 case V8DI_FTYPE_V4DI_V8DI_UQI:
9403 case V8DI_FTYPE_V8DI_V8DI_UQI:
9404 case V8DI_FTYPE_V8HI_V8DI_UQI:
9405 case V8DI_FTYPE_V8SI_V8DI_UQI:
9406 case V8HI_FTYPE_V8DI_V8HI_UQI:
9407 case V8SI_FTYPE_V8DI_V8SI_UQI:
9408 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9409 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9410 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9411 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9412 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9413 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9414 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9415 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9416 case V32HI_FTYPE_V16SF_V16SF_USI:
9417 case V16HI_FTYPE_V8SF_V8SF_UHI:
9418 case V8HI_FTYPE_V4SF_V4SF_UQI:
9419 case V16HI_FTYPE_V16SF_V16HI_UHI:
9420 case V8HI_FTYPE_V8SF_V8HI_UQI:
9421 case V8HI_FTYPE_V4SF_V8HI_UQI:
9422 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9423 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9424 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9425 nargs = 3;
9426 break;
9427 case V32QI_FTYPE_V32QI_V32QI_INT:
9428 case V16HI_FTYPE_V16HI_V16HI_INT:
9429 case V16QI_FTYPE_V16QI_V16QI_INT:
9430 case V4DI_FTYPE_V4DI_V4DI_INT:
9431 case V8HI_FTYPE_V8HI_V8HI_INT:
9432 case V8SI_FTYPE_V8SI_V8SI_INT:
9433 case V8SI_FTYPE_V8SI_V4SI_INT:
9434 case V8SF_FTYPE_V8SF_V8SF_INT:
9435 case V8SF_FTYPE_V8SF_V4SF_INT:
9436 case V4SI_FTYPE_V4SI_V4SI_INT:
9437 case V4DF_FTYPE_V4DF_V4DF_INT:
9438 case V16SF_FTYPE_V16SF_V16SF_INT:
9439 case V16SF_FTYPE_V16SF_V4SF_INT:
9440 case V16SI_FTYPE_V16SI_V4SI_INT:
9441 case V4DF_FTYPE_V4DF_V2DF_INT:
9442 case V4SF_FTYPE_V4SF_V4SF_INT:
9443 case V2DI_FTYPE_V2DI_V2DI_INT:
9444 case V4DI_FTYPE_V4DI_V2DI_INT:
9445 case V2DF_FTYPE_V2DF_V2DF_INT:
9446 case UQI_FTYPE_V8DI_V8UDI_INT:
9447 case UQI_FTYPE_V8DF_V8DF_INT:
9448 case UQI_FTYPE_V2DF_V2DF_INT:
9449 case UQI_FTYPE_V4SF_V4SF_INT:
9450 case UHI_FTYPE_V16SI_V16SI_INT:
9451 case UHI_FTYPE_V16SF_V16SF_INT:
9452 case V64QI_FTYPE_V64QI_V64QI_INT:
9453 case V32HI_FTYPE_V32HI_V32HI_INT:
9454 case V16SI_FTYPE_V16SI_V16SI_INT:
9455 case V8DI_FTYPE_V8DI_V8DI_INT:
9456 nargs = 3;
9457 nargs_constant = 1;
9458 break;
9459 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9460 nargs = 3;
9461 rmode = V4DImode;
9462 nargs_constant = 1;
9463 break;
9464 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9465 nargs = 3;
9466 rmode = V2DImode;
9467 nargs_constant = 1;
9468 break;
9469 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9470 nargs = 3;
9471 rmode = DImode;
9472 nargs_constant = 1;
9473 break;
9474 case V2DI_FTYPE_V2DI_UINT_UINT:
9475 nargs = 3;
9476 nargs_constant = 2;
9477 break;
9478 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9479 nargs = 3;
9480 rmode = V8DImode;
9481 nargs_constant = 1;
9482 break;
9483 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9484 nargs = 5;
9485 rmode = V8DImode;
9486 mask_pos = 2;
9487 nargs_constant = 1;
9488 break;
9489 case QI_FTYPE_V8DF_INT_UQI:
9490 case QI_FTYPE_V4DF_INT_UQI:
9491 case QI_FTYPE_V2DF_INT_UQI:
9492 case HI_FTYPE_V16SF_INT_UHI:
9493 case QI_FTYPE_V8SF_INT_UQI:
9494 case QI_FTYPE_V4SF_INT_UQI:
9495 case V4SI_FTYPE_V4SI_V4SI_UHI:
9496 case V8SI_FTYPE_V8SI_V8SI_UHI:
9497 nargs = 3;
9498 mask_pos = 1;
9499 nargs_constant = 1;
9500 break;
9501 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9502 nargs = 5;
9503 rmode = V4DImode;
9504 mask_pos = 2;
9505 nargs_constant = 1;
9506 break;
9507 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9508 nargs = 5;
9509 rmode = V2DImode;
9510 mask_pos = 2;
9511 nargs_constant = 1;
9512 break;
9513 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9514 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9515 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9516 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9517 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9518 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9519 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9520 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9521 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9522 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9523 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9524 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9525 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9526 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9527 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9528 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9529 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9530 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9531 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9532 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9533 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9534 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9535 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9536 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9537 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9538 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9539 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9540 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9541 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9542 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9543 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9544 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9545 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9546 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9547 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9548 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9549 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9550 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9551 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9552 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9553 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9554 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9555 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9556 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9557 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9558 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9559 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9560 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9561 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9562 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9563 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9564 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9565 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9566 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9567 nargs = 4;
9568 break;
9569 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9570 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9571 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9572 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9573 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9574 nargs = 4;
9575 nargs_constant = 1;
9576 break;
9577 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9578 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9579 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9580 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9581 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9582 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9583 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9584 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9585 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9586 case USI_FTYPE_V32QI_V32QI_INT_USI:
9587 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9588 case USI_FTYPE_V32HI_V32HI_INT_USI:
9589 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9590 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9591 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
9592 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
9593 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
9594 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
9595 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
9596 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
9597 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
9598 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
9599 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
9600 nargs = 4;
9601 mask_pos = 1;
9602 nargs_constant = 1;
9603 break;
9604 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9605 nargs = 4;
9606 nargs_constant = 2;
9607 break;
9608 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9609 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9610 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9611 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9612 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9613 nargs = 4;
9614 break;
9615 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9616 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9617 mask_pos = 1;
9618 nargs = 4;
9619 nargs_constant = 1;
9620 break;
9621 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9622 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9623 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9624 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9625 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9626 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9627 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9628 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9629 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9630 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9631 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9632 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9633 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9634 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9635 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9636 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9637 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9638 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9639 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9640 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9641 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9642 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9643 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9644 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9645 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9646 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9647 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9648 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9649 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9650 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9651 nargs = 4;
9652 mask_pos = 2;
9653 nargs_constant = 1;
9654 break;
9655 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9656 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9657 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9658 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9659 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9660 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9661 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9662 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9663 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9664 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9665 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9666 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9667 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9668 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9669 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9670 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9671 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9672 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9673 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9674 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9675 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9676 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9677 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9678 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9679 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9680 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9681 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9682 nargs = 5;
9683 mask_pos = 2;
9684 nargs_constant = 1;
9685 break;
9686 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9687 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9688 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9689 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9690 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9691 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9692 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9693 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9694 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9695 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9696 nargs = 5;
9697 mask_pos = 1;
9698 nargs_constant = 1;
9699 break;
9700 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9701 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9702 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9703 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9704 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9705 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9706 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9707 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9708 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9709 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9710 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9711 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9712 nargs = 5;
9713 mask_pos = 1;
9714 nargs_constant = 2;
9715 break;
9716
9717 default:
9718 gcc_unreachable ();
9719 }
9720
9721 gcc_assert (nargs <= ARRAY_SIZE (args));
9722
9723 if (comparison != UNKNOWN)
9724 {
9725 gcc_assert (nargs == 2);
9726 return ix86_expand_sse_compare (d, exp, target, swap);
9727 }
9728
9729 if (rmode == VOIDmode || rmode == tmode)
9730 {
9731 if (optimize
9732 || target == 0
9733 || GET_MODE (target) != tmode
9734 || !insn_p->operand[0].predicate (target, tmode))
9735 target = gen_reg_rtx (tmode);
9736 else if (memory_operand (target, tmode))
9737 num_memory++;
9738 real_target = target;
9739 }
9740 else
9741 {
9742 real_target = gen_reg_rtx (tmode);
9743 target = lowpart_subreg (rmode, real_target, tmode);
9744 }
9745
9746 for (i = 0; i < nargs; i++)
9747 {
9748 tree arg = CALL_EXPR_ARG (exp, i);
9749 rtx op = expand_normal (arg);
9750 machine_mode mode = insn_p->operand[i + 1].mode;
9751 bool match = insn_p->operand[i + 1].predicate (op, mode);
9752
9753 if (second_arg_count && i == 1)
9754 {
9755 /* SIMD shift insns take either an 8-bit immediate or
9756 a register as the count.  But builtin functions take an
9757 int as the count.  If the count doesn't match, put it in
9758 a register.  The instructions use a 64-bit count; if op
9759 is only 32-bit, zero-extend it, since negative shift
9760 counts are undefined behavior and zero-extension is more
9761 efficient.  */
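/* The zero-extension is done via convert_modes with UNSIGNEDP == 1;
   a count that is not a scalar integer is instead taken through
   lowpart_subreg, and a final copy_to_reg satisfies the operand
   predicate if needed.  */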
9762 if (!match)
9763 {
9764 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9765 op = convert_modes (mode, GET_MODE (op), op, 1);
9766 else
9767 op = lowpart_subreg (mode, op, GET_MODE (op));
9768 if (!insn_p->operand[i + 1].predicate (op, mode))
9769 op = copy_to_reg (op);
9770 }
9771 }
9772 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9773 || (!mask_pos && (nargs - i) <= nargs_constant))
9774 {
9775 if (!match)
9776 switch (icode)
9777 {
9778 case CODE_FOR_avx_vinsertf128v4di:
9779 case CODE_FOR_avx_vextractf128v4di:
9780 error ("the last argument must be a 1-bit immediate");
9781 return const0_rtx;
9782
9783 case CODE_FOR_avx512f_cmpv8di3_mask:
9784 case CODE_FOR_avx512f_cmpv16si3_mask:
9785 case CODE_FOR_avx512f_ucmpv8di3_mask:
9786 case CODE_FOR_avx512f_ucmpv16si3_mask:
9787 case CODE_FOR_avx512vl_cmpv4di3_mask:
9788 case CODE_FOR_avx512vl_cmpv8si3_mask:
9789 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9790 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9791 case CODE_FOR_avx512vl_cmpv2di3_mask:
9792 case CODE_FOR_avx512vl_cmpv4si3_mask:
9793 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9794 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9795 error ("the last argument must be a 3-bit immediate");
9796 return const0_rtx;
9797
9798 case CODE_FOR_sse4_1_roundsd:
9799 case CODE_FOR_sse4_1_roundss:
9800
9801 case CODE_FOR_sse4_1_roundpd:
9802 case CODE_FOR_sse4_1_roundps:
9803 case CODE_FOR_avx_roundpd256:
9804 case CODE_FOR_avx_roundps256:
9805
9806 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9807 case CODE_FOR_sse4_1_roundps_sfix:
9808 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9809 case CODE_FOR_avx_roundps_sfix256:
9810
9811 case CODE_FOR_sse4_1_blendps:
9812 case CODE_FOR_avx_blendpd256:
9813 case CODE_FOR_avx_vpermilv4df:
9814 case CODE_FOR_avx_vpermilv4df_mask:
9815 case CODE_FOR_avx512f_getmantv8df_mask:
9816 case CODE_FOR_avx512f_getmantv16sf_mask:
9817 case CODE_FOR_avx512vl_getmantv8sf_mask:
9818 case CODE_FOR_avx512vl_getmantv4df_mask:
9819 case CODE_FOR_avx512vl_getmantv4sf_mask:
9820 case CODE_FOR_avx512vl_getmantv2df_mask:
9821 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9822 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9823 case CODE_FOR_avx512dq_rangepv4df_mask:
9824 case CODE_FOR_avx512dq_rangepv8sf_mask:
9825 case CODE_FOR_avx512dq_rangepv2df_mask:
9826 case CODE_FOR_avx512dq_rangepv4sf_mask:
9827 case CODE_FOR_avx_shufpd256_mask:
9828 error ("the last argument must be a 4-bit immediate");
9829 return const0_rtx;
9830
9831 case CODE_FOR_sha1rnds4:
9832 case CODE_FOR_sse4_1_blendpd:
9833 case CODE_FOR_avx_vpermilv2df:
9834 case CODE_FOR_avx_vpermilv2df_mask:
9835 case CODE_FOR_xop_vpermil2v2df3:
9836 case CODE_FOR_xop_vpermil2v4sf3:
9837 case CODE_FOR_xop_vpermil2v4df3:
9838 case CODE_FOR_xop_vpermil2v8sf3:
9839 case CODE_FOR_avx512f_vinsertf32x4_mask:
9840 case CODE_FOR_avx512f_vinserti32x4_mask:
9841 case CODE_FOR_avx512f_vextractf32x4_mask:
9842 case CODE_FOR_avx512f_vextracti32x4_mask:
9843 case CODE_FOR_sse2_shufpd:
9844 case CODE_FOR_sse2_shufpd_mask:
9845 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9846 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9847 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9848 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9849 error ("the last argument must be a 2-bit immediate");
9850 return const0_rtx;
9851
9852 case CODE_FOR_avx_vextractf128v4df:
9853 case CODE_FOR_avx_vextractf128v8sf:
9854 case CODE_FOR_avx_vextractf128v8si:
9855 case CODE_FOR_avx_vinsertf128v4df:
9856 case CODE_FOR_avx_vinsertf128v8sf:
9857 case CODE_FOR_avx_vinsertf128v8si:
9858 case CODE_FOR_avx512f_vinsertf64x4_mask:
9859 case CODE_FOR_avx512f_vinserti64x4_mask:
9860 case CODE_FOR_avx512f_vextractf64x4_mask:
9861 case CODE_FOR_avx512f_vextracti64x4_mask:
9862 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9863 case CODE_FOR_avx512dq_vinserti32x8_mask:
9864 case CODE_FOR_avx512vl_vinsertv4df:
9865 case CODE_FOR_avx512vl_vinsertv4di:
9866 case CODE_FOR_avx512vl_vinsertv8sf:
9867 case CODE_FOR_avx512vl_vinsertv8si:
9868 error ("the last argument must be a 1-bit immediate");
9869 return const0_rtx;
9870
9871 case CODE_FOR_avx_vmcmpv2df3:
9872 case CODE_FOR_avx_vmcmpv4sf3:
9873 case CODE_FOR_avx_cmpv2df3:
9874 case CODE_FOR_avx_cmpv4sf3:
9875 case CODE_FOR_avx_cmpv4df3:
9876 case CODE_FOR_avx_cmpv8sf3:
9877 case CODE_FOR_avx512f_cmpv8df3_mask:
9878 case CODE_FOR_avx512f_cmpv16sf3_mask:
9879 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9880 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9881 error ("the last argument must be a 5-bit immediate");
9882 return const0_rtx;
9883
9884 default:
9885 switch (nargs_constant)
9886 {
9887 case 2:
9888 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9889 || (!mask_pos && (nargs - i) == nargs_constant))
9890 {
9891 error ("the next to last argument must be an 8-bit immediate");
9892 break;
9893 }
9894 /* FALLTHRU */
9895 case 1:
9896 error ("the last argument must be an 8-bit immediate");
9897 break;
9898 default:
9899 gcc_unreachable ();
9900 }
9901 return const0_rtx;
9902 }
9903 }
9904 else
9905 {
9906 if (VECTOR_MODE_P (mode))
9907 op = safe_vector_operand (op, mode);
9908
9909 /* If we aren't optimizing, only allow one memory operand to
9910 be generated. */
9911 if (memory_operand (op, mode))
9912 num_memory++;
9913
9914 op = fixup_modeless_constant (op, mode);
9915
9916 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9917 {
9918 if (optimize || !match || num_memory > 1)
9919 op = copy_to_mode_reg (mode, op);
9920 }
9921 else
9922 {
9923 op = copy_to_reg (op);
9924 op = lowpart_subreg (mode, op, GET_MODE (op));
9925 }
9926 }
9927
9928 args[i].op = op;
9929 args[i].mode = mode;
9930 }
9931
9932 switch (nargs)
9933 {
9934 case 1:
9935 pat = GEN_FCN (icode) (real_target, args[0].op);
9936 break;
9937 case 2:
9938 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9939 break;
9940 case 3:
9941 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9942 args[2].op);
9943 break;
9944 case 4:
9945 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9946 args[2].op, args[3].op);
9947 break;
9948 case 5:
9949 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9950 args[2].op, args[3].op, args[4].op);
9951 break;
9952 case 6:
9953 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9954 args[2].op, args[3].op, args[4].op,
9955 args[5].op);
9956 break;
9957 default:
9958 gcc_unreachable ();
9959 }
9960
9961 if (! pat)
9962 return 0;
9963
9964 emit_insn (pat);
9965 return target;
9966 }
9967
9968 /* Transform a pattern of the following layout:
9969 (set A
9970 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9971 into:
9972 (set A B)
9973 i.e. drop the embedded rounding operand C.  */
9974
9975 static rtx
9976 ix86_erase_embedded_rounding (rtx pat)
9977 {
9978 if (GET_CODE (pat) == INSN)
9979 pat = PATTERN (pat);
9980
9981 gcc_assert (GET_CODE (pat) == SET);
9982 rtx src = SET_SRC (pat);
9983 gcc_assert (XVECLEN (src, 0) == 2);
9984 rtx p0 = XVECEXP (src, 0, 0);
9985 gcc_assert (GET_CODE (src) == UNSPEC
9986 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9987 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9988 return res;
9989 }
9990
9991 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9992 with rounding. */
9993 static rtx
9994 ix86_expand_sse_comi_round (const struct builtin_description *d,
9995 tree exp, rtx target)
9996 {
9997 rtx pat, set_dst;
9998 tree arg0 = CALL_EXPR_ARG (exp, 0);
9999 tree arg1 = CALL_EXPR_ARG (exp, 1);
10000 tree arg2 = CALL_EXPR_ARG (exp, 2);
10001 tree arg3 = CALL_EXPR_ARG (exp, 3);
10002 rtx op0 = expand_normal (arg0);
10003 rtx op1 = expand_normal (arg1);
10004 rtx op2 = expand_normal (arg2);
10005 rtx op3 = expand_normal (arg3);
10006 enum insn_code icode = d->icode;
10007 const struct insn_data_d *insn_p = &insn_data[icode];
10008 machine_mode mode0 = insn_p->operand[0].mode;
10009 machine_mode mode1 = insn_p->operand[1].mode;
10010 enum rtx_code comparison = UNEQ;
10011 bool need_ucomi = false;
10012
10013 /* See avxintrin.h for values. */
10014 enum rtx_code comi_comparisons[32] =
10015 {
10016 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
10017 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
10018 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
10019 };
10020 bool need_ucomi_values[32] =
10021 {
10022 true, false, false, true, true, false, false, true,
10023 true, false, false, true, true, false, false, true,
10024 false, true, true, false, false, true, true, false,
10025 false, true, true, false, false, true, true, false
10026 };
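/* COMIS{S,D} raises the invalid-operation exception for any NaN
   operand, while UCOMIS{S,D} does so only for signaling NaNs, so
   need_ucomi selects the quiet (non-signaling) comparison form.  */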
10027
10028 if (!CONST_INT_P (op2))
10029 {
10030 error ("the third argument must be a comparison constant");
10031 return const0_rtx;
10032 }
10033 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10034 {
10035 error ("incorrect comparison mode");
10036 return const0_rtx;
10037 }
10038
10039 if (!insn_p->operand[2].predicate (op3, SImode))
10040 {
10041 error ("incorrect rounding operand");
10042 return const0_rtx;
10043 }
10044
10045 comparison = comi_comparisons[INTVAL (op2)];
10046 need_ucomi = need_ucomi_values[INTVAL (op2)];
10047
10048 if (VECTOR_MODE_P (mode0))
10049 op0 = safe_vector_operand (op0, mode0);
10050 if (VECTOR_MODE_P (mode1))
10051 op1 = safe_vector_operand (op1, mode1);
10052
10053 target = gen_reg_rtx (SImode);
10054 emit_move_insn (target, const0_rtx);
10055 target = gen_rtx_SUBREG (QImode, target, 0);
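/* The comparison result is built in the low byte of a zeroed SImode
   pseudo; the STRICT_LOW_PART store below leaves the upper bits
   clear, so SUBREG_REG (target) is returned directly as the int
   result.  */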
10056
10057 if ((optimize && !register_operand (op0, mode0))
10058 || !insn_p->operand[0].predicate (op0, mode0))
10059 op0 = copy_to_mode_reg (mode0, op0);
10060 if ((optimize && !register_operand (op1, mode1))
10061 || !insn_p->operand[1].predicate (op1, mode1))
10062 op1 = copy_to_mode_reg (mode1, op1);
10063
10064 if (need_ucomi)
10065 icode = icode == CODE_FOR_sse_comi_round
10066 ? CODE_FOR_sse_ucomi_round
10067 : CODE_FOR_sse2_ucomi_round;
10068
10069 pat = GEN_FCN (icode) (op0, op1, op3);
10070 if (! pat)
10071 return 0;
10072
10073 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10074 if (INTVAL (op3) == NO_ROUND)
10075 {
10076 pat = ix86_erase_embedded_rounding (pat);
10077 if (! pat)
10078 return 0;
10079
10080 set_dst = SET_DEST (pat);
10081 }
10082 else
10083 {
10084 gcc_assert (GET_CODE (pat) == SET);
10085 set_dst = SET_DEST (pat);
10086 }
10087
10088 emit_insn (pat);
10089 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10090 gen_rtx_fmt_ee (comparison, QImode,
10091 set_dst,
10092 const0_rtx)));
10093
10094 return SUBREG_REG (target);
10095 }
10096
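/* Subroutine of ix86_expand_builtin to take care of builtins whose
   last argument is an embedded rounding/SAE control.  If that
   argument is NO_ROUND, the embedded rounding unspec is erased again
   and the normal form of the pattern is used.  */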
10097 static rtx
10098 ix86_expand_round_builtin (const struct builtin_description *d,
10099 tree exp, rtx target)
10100 {
10101 rtx pat;
10102 unsigned int i, nargs;
10103 struct
10104 {
10105 rtx op;
10106 machine_mode mode;
10107 } args[6];
10108 enum insn_code icode = d->icode;
10109 const struct insn_data_d *insn_p = &insn_data[icode];
10110 machine_mode tmode = insn_p->operand[0].mode;
10111 unsigned int nargs_constant = 0;
10112 unsigned int redundant_embed_rnd = 0;
10113
10114 switch ((enum ix86_builtin_func_type) d->flag)
10115 {
10116 case UINT64_FTYPE_V2DF_INT:
10117 case UINT64_FTYPE_V4SF_INT:
10118 case UINT_FTYPE_V2DF_INT:
10119 case UINT_FTYPE_V4SF_INT:
10120 case INT64_FTYPE_V2DF_INT:
10121 case INT64_FTYPE_V4SF_INT:
10122 case INT_FTYPE_V2DF_INT:
10123 case INT_FTYPE_V4SF_INT:
10124 nargs = 2;
10125 break;
10126 case V4SF_FTYPE_V4SF_UINT_INT:
10127 case V4SF_FTYPE_V4SF_UINT64_INT:
10128 case V2DF_FTYPE_V2DF_UINT64_INT:
10129 case V4SF_FTYPE_V4SF_INT_INT:
10130 case V4SF_FTYPE_V4SF_INT64_INT:
10131 case V2DF_FTYPE_V2DF_INT64_INT:
10132 case V4SF_FTYPE_V4SF_V4SF_INT:
10133 case V2DF_FTYPE_V2DF_V2DF_INT:
10134 case V4SF_FTYPE_V4SF_V2DF_INT:
10135 case V2DF_FTYPE_V2DF_V4SF_INT:
10136 nargs = 3;
10137 break;
10138 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10139 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10140 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10141 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10142 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10143 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10144 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10145 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10146 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10147 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10148 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10149 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10150 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10151 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10152 nargs = 4;
10153 break;
10154 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10155 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10156 nargs_constant = 2;
10157 nargs = 4;
10158 break;
10159 case INT_FTYPE_V4SF_V4SF_INT_INT:
10160 case INT_FTYPE_V2DF_V2DF_INT_INT:
10161 return ix86_expand_sse_comi_round (d, exp, target);
10162 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10163 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10164 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10165 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10166 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10167 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10168 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10169 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10170 nargs = 5;
10171 break;
10172 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10173 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10174 nargs_constant = 4;
10175 nargs = 5;
10176 break;
10177 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10178 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10179 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10180 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10181 nargs_constant = 3;
10182 nargs = 5;
10183 break;
10184 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10185 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10186 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10187 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10188 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10189 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10190 nargs = 6;
10191 nargs_constant = 4;
10192 break;
10193 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10194 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10195 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10196 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10197 nargs = 6;
10198 nargs_constant = 3;
10199 break;
10200 default:
10201 gcc_unreachable ();
10202 }
10203 gcc_assert (nargs <= ARRAY_SIZE (args));
10204
10205 if (optimize
10206 || target == 0
10207 || GET_MODE (target) != tmode
10208 || !insn_p->operand[0].predicate (target, tmode))
10209 target = gen_reg_rtx (tmode);
10210
10211 for (i = 0; i < nargs; i++)
10212 {
10213 tree arg = CALL_EXPR_ARG (exp, i);
10214 rtx op = expand_normal (arg);
10215 machine_mode mode = insn_p->operand[i + 1].mode;
10216 bool match = insn_p->operand[i + 1].predicate (op, mode);
10217
10218 if (i == nargs - nargs_constant)
10219 {
10220 if (!match)
10221 {
10222 switch (icode)
10223 {
10224 case CODE_FOR_avx512f_getmantv8df_mask_round:
10225 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10226 case CODE_FOR_avx512f_vgetmantv2df_round:
10227 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10228 case CODE_FOR_avx512f_vgetmantv4sf_round:
10229 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10230 error ("the immediate argument must be a 4-bit immediate");
10231 return const0_rtx;
10232 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10233 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10234 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10235 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10236 error ("the immediate argument must be a 5-bit immediate");
10237 return const0_rtx;
10238 default:
10239 error ("the immediate argument must be an 8-bit immediate");
10240 return const0_rtx;
10241 }
10242 }
10243 }
10244 else if (i == nargs-1)
10245 {
10246 if (!insn_p->operand[nargs].predicate (op, SImode))
10247 {
10248 error ("incorrect rounding operand");
10249 return const0_rtx;
10250 }
10251
10252 /* If there is no rounding, use the normal version of the pattern.  */
10253 if (INTVAL (op) == NO_ROUND)
10254 redundant_embed_rnd = 1;
10255 }
10256 else
10257 {
10258 if (VECTOR_MODE_P (mode))
10259 op = safe_vector_operand (op, mode);
10260
10261 op = fixup_modeless_constant (op, mode);
10262
10263 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10264 {
10265 if (optimize || !match)
10266 op = copy_to_mode_reg (mode, op);
10267 }
10268 else
10269 {
10270 op = copy_to_reg (op);
10271 op = lowpart_subreg (mode, op, GET_MODE (op));
10272 }
10273 }
10274
10275 args[i].op = op;
10276 args[i].mode = mode;
10277 }
10278
10279 switch (nargs)
10280 {
10281 case 1:
10282 pat = GEN_FCN (icode) (target, args[0].op);
10283 break;
10284 case 2:
10285 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10286 break;
10287 case 3:
10288 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10289 args[2].op);
10290 break;
10291 case 4:
10292 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10293 args[2].op, args[3].op);
10294 break;
10295 case 5:
10296 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10297 args[2].op, args[3].op, args[4].op);
10298 break;
10299 case 6:
10300 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10301 args[2].op, args[3].op, args[4].op,
10302 args[5].op);
10303 break;
10304 default:
10305 gcc_unreachable ();
10306 }
10307
10308 if (!pat)
10309 return 0;
10310
10311 if (redundant_embed_rnd)
10312 pat = ix86_erase_embedded_rounding (pat);
10313
10314 emit_insn (pat);
10315 return target;
10316 }
10317
10318 /* Subroutine of ix86_expand_builtin to take care of special insns
10319 with variable number of operands. */
10320
10321 static rtx
10322 ix86_expand_special_args_builtin (const struct builtin_description *d,
10323 tree exp, rtx target)
10324 {
10325 tree arg;
10326 rtx pat, op;
10327 unsigned int i, nargs, arg_adjust, memory;
10328 bool aligned_mem = false;
10329 struct
10330 {
10331 rtx op;
10332 machine_mode mode;
10333 } args[3];
10334 enum insn_code icode = d->icode;
10335 bool last_arg_constant = false;
10336 const struct insn_data_d *insn_p = &insn_data[icode];
10337 machine_mode tmode = insn_p->operand[0].mode;
10338 enum { load, store } klass;
10339
10340 switch ((enum ix86_builtin_func_type) d->flag)
10341 {
10342 case VOID_FTYPE_VOID:
10343 emit_insn (GEN_FCN (icode) (target));
10344 return 0;
10345 case VOID_FTYPE_UINT64:
10346 case VOID_FTYPE_UNSIGNED:
10347 nargs = 0;
10348 klass = store;
10349 memory = 0;
10350 break;
10351
10352 case INT_FTYPE_VOID:
10353 case USHORT_FTYPE_VOID:
10354 case UINT64_FTYPE_VOID:
10355 case UINT_FTYPE_VOID:
10356 case UNSIGNED_FTYPE_VOID:
10357 nargs = 0;
10358 klass = load;
10359 memory = 0;
10360 break;
10361 case UINT64_FTYPE_PUNSIGNED:
10362 case V2DI_FTYPE_PV2DI:
10363 case V4DI_FTYPE_PV4DI:
10364 case V32QI_FTYPE_PCCHAR:
10365 case V16QI_FTYPE_PCCHAR:
10366 case V8SF_FTYPE_PCV4SF:
10367 case V8SF_FTYPE_PCFLOAT:
10368 case V4SF_FTYPE_PCFLOAT:
10369 case V4DF_FTYPE_PCV2DF:
10370 case V4DF_FTYPE_PCDOUBLE:
10371 case V2DF_FTYPE_PCDOUBLE:
10372 case VOID_FTYPE_PVOID:
10373 case V8DI_FTYPE_PV8DI:
10374 nargs = 1;
10375 klass = load;
10376 memory = 0;
10377 switch (icode)
10378 {
10379 case CODE_FOR_sse4_1_movntdqa:
10380 case CODE_FOR_avx2_movntdqa:
10381 case CODE_FOR_avx512f_movntdqa:
10382 aligned_mem = true;
10383 break;
10384 default:
10385 break;
10386 }
10387 break;
10388 case VOID_FTYPE_PV2SF_V4SF:
10389 case VOID_FTYPE_PV8DI_V8DI:
10390 case VOID_FTYPE_PV4DI_V4DI:
10391 case VOID_FTYPE_PV2DI_V2DI:
10392 case VOID_FTYPE_PCHAR_V32QI:
10393 case VOID_FTYPE_PCHAR_V16QI:
10394 case VOID_FTYPE_PFLOAT_V16SF:
10395 case VOID_FTYPE_PFLOAT_V8SF:
10396 case VOID_FTYPE_PFLOAT_V4SF:
10397 case VOID_FTYPE_PDOUBLE_V8DF:
10398 case VOID_FTYPE_PDOUBLE_V4DF:
10399 case VOID_FTYPE_PDOUBLE_V2DF:
10400 case VOID_FTYPE_PLONGLONG_LONGLONG:
10401 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10402 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10403 case VOID_FTYPE_PINT_INT:
10404 nargs = 1;
10405 klass = store;
10406 /* Reserve memory operand for target. */
10407 memory = ARRAY_SIZE (args);
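/* I.e. an index past any valid args[] slot, so none of the remaining
   arguments is treated as the memory operand; the store destination
   set up below is the memory operand.  */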
10408 switch (icode)
10409 {
10410 /* These builtins and instructions require the memory
10411 to be properly aligned. */
10412 case CODE_FOR_avx_movntv4di:
10413 case CODE_FOR_sse2_movntv2di:
10414 case CODE_FOR_avx_movntv8sf:
10415 case CODE_FOR_sse_movntv4sf:
10416 case CODE_FOR_sse4a_vmmovntv4sf:
10417 case CODE_FOR_avx_movntv4df:
10418 case CODE_FOR_sse2_movntv2df:
10419 case CODE_FOR_sse4a_vmmovntv2df:
10420 case CODE_FOR_sse2_movntidi:
10421 case CODE_FOR_sse_movntq:
10422 case CODE_FOR_sse2_movntisi:
10423 case CODE_FOR_avx512f_movntv16sf:
10424 case CODE_FOR_avx512f_movntv8df:
10425 case CODE_FOR_avx512f_movntv8di:
10426 aligned_mem = true;
10427 break;
10428 default:
10429 break;
10430 }
10431 break;
10432 case VOID_FTYPE_PVOID_PCVOID:
10433 nargs = 1;
10434 klass = store;
10435 memory = 0;
10436
10437 break;
10438 case V4SF_FTYPE_V4SF_PCV2SF:
10439 case V2DF_FTYPE_V2DF_PCDOUBLE:
10440 nargs = 2;
10441 klass = load;
10442 memory = 1;
10443 break;
10444 case V8SF_FTYPE_PCV8SF_V8SI:
10445 case V4DF_FTYPE_PCV4DF_V4DI:
10446 case V4SF_FTYPE_PCV4SF_V4SI:
10447 case V2DF_FTYPE_PCV2DF_V2DI:
10448 case V8SI_FTYPE_PCV8SI_V8SI:
10449 case V4DI_FTYPE_PCV4DI_V4DI:
10450 case V4SI_FTYPE_PCV4SI_V4SI:
10451 case V2DI_FTYPE_PCV2DI_V2DI:
10452 case VOID_FTYPE_INT_INT64:
10453 nargs = 2;
10454 klass = load;
10455 memory = 0;
10456 break;
10457 case VOID_FTYPE_PV8DF_V8DF_UQI:
10458 case VOID_FTYPE_PV4DF_V4DF_UQI:
10459 case VOID_FTYPE_PV2DF_V2DF_UQI:
10460 case VOID_FTYPE_PV16SF_V16SF_UHI:
10461 case VOID_FTYPE_PV8SF_V8SF_UQI:
10462 case VOID_FTYPE_PV4SF_V4SF_UQI:
10463 case VOID_FTYPE_PV8DI_V8DI_UQI:
10464 case VOID_FTYPE_PV4DI_V4DI_UQI:
10465 case VOID_FTYPE_PV2DI_V2DI_UQI:
10466 case VOID_FTYPE_PV16SI_V16SI_UHI:
10467 case VOID_FTYPE_PV8SI_V8SI_UQI:
10468 case VOID_FTYPE_PV4SI_V4SI_UQI:
10469 case VOID_FTYPE_PV64QI_V64QI_UDI:
10470 case VOID_FTYPE_PV32HI_V32HI_USI:
10471 case VOID_FTYPE_PV32QI_V32QI_USI:
10472 case VOID_FTYPE_PV16QI_V16QI_UHI:
10473 case VOID_FTYPE_PV16HI_V16HI_UHI:
10474 case VOID_FTYPE_PV8HI_V8HI_UQI:
10475 switch (icode)
10476 {
10477 /* These builtins and instructions require the memory
10478 to be properly aligned. */
10479 case CODE_FOR_avx512f_storev16sf_mask:
10480 case CODE_FOR_avx512f_storev16si_mask:
10481 case CODE_FOR_avx512f_storev8df_mask:
10482 case CODE_FOR_avx512f_storev8di_mask:
10483 case CODE_FOR_avx512vl_storev8sf_mask:
10484 case CODE_FOR_avx512vl_storev8si_mask:
10485 case CODE_FOR_avx512vl_storev4df_mask:
10486 case CODE_FOR_avx512vl_storev4di_mask:
10487 case CODE_FOR_avx512vl_storev4sf_mask:
10488 case CODE_FOR_avx512vl_storev4si_mask:
10489 case CODE_FOR_avx512vl_storev2df_mask:
10490 case CODE_FOR_avx512vl_storev2di_mask:
10491 aligned_mem = true;
10492 break;
10493 default:
10494 break;
10495 }
10496 /* FALLTHRU */
10497 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10498 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10499 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10500 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10501 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10502 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10503 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10504 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10505 case VOID_FTYPE_PV8SI_V8DI_UQI:
10506 case VOID_FTYPE_PV8HI_V8DI_UQI:
10507 case VOID_FTYPE_PV16HI_V16SI_UHI:
10508 case VOID_FTYPE_PV16QI_V8DI_UQI:
10509 case VOID_FTYPE_PV16QI_V16SI_UHI:
10510 case VOID_FTYPE_PV4SI_V4DI_UQI:
10511 case VOID_FTYPE_PV4SI_V2DI_UQI:
10512 case VOID_FTYPE_PV8HI_V4DI_UQI:
10513 case VOID_FTYPE_PV8HI_V2DI_UQI:
10514 case VOID_FTYPE_PV8HI_V8SI_UQI:
10515 case VOID_FTYPE_PV8HI_V4SI_UQI:
10516 case VOID_FTYPE_PV16QI_V4DI_UQI:
10517 case VOID_FTYPE_PV16QI_V2DI_UQI:
10518 case VOID_FTYPE_PV16QI_V8SI_UQI:
10519 case VOID_FTYPE_PV16QI_V4SI_UQI:
10520 case VOID_FTYPE_PCHAR_V64QI_UDI:
10521 case VOID_FTYPE_PCHAR_V32QI_USI:
10522 case VOID_FTYPE_PCHAR_V16QI_UHI:
10523 case VOID_FTYPE_PSHORT_V32HI_USI:
10524 case VOID_FTYPE_PSHORT_V16HI_UHI:
10525 case VOID_FTYPE_PSHORT_V8HI_UQI:
10526 case VOID_FTYPE_PINT_V16SI_UHI:
10527 case VOID_FTYPE_PINT_V8SI_UQI:
10528 case VOID_FTYPE_PINT_V4SI_UQI:
10529 case VOID_FTYPE_PINT64_V8DI_UQI:
10530 case VOID_FTYPE_PINT64_V4DI_UQI:
10531 case VOID_FTYPE_PINT64_V2DI_UQI:
10532 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10533 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10534 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10535 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10536 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10537 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10538 case VOID_FTYPE_PV32QI_V32HI_USI:
10539 case VOID_FTYPE_PV16QI_V16HI_UHI:
10540 case VOID_FTYPE_PV8QI_V8HI_UQI:
10541 nargs = 2;
10542 klass = store;
10543 /* Reserve memory operand for target. */
10544 memory = ARRAY_SIZE (args);
10545 break;
10546 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10547 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10548 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10549 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10550 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10551 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10552 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10553 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10554 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10555 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10556 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10557 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10558 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10559 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10560 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10561 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10562 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10563 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10564 switch (icode)
10565 {
10566 /* These builtins and instructions require the memory
10567 to be properly aligned. */
10568 case CODE_FOR_avx512f_loadv16sf_mask:
10569 case CODE_FOR_avx512f_loadv16si_mask:
10570 case CODE_FOR_avx512f_loadv8df_mask:
10571 case CODE_FOR_avx512f_loadv8di_mask:
10572 case CODE_FOR_avx512vl_loadv8sf_mask:
10573 case CODE_FOR_avx512vl_loadv8si_mask:
10574 case CODE_FOR_avx512vl_loadv4df_mask:
10575 case CODE_FOR_avx512vl_loadv4di_mask:
10576 case CODE_FOR_avx512vl_loadv4sf_mask:
10577 case CODE_FOR_avx512vl_loadv4si_mask:
10578 case CODE_FOR_avx512vl_loadv2df_mask:
10579 case CODE_FOR_avx512vl_loadv2di_mask:
10580 case CODE_FOR_avx512bw_loadv64qi_mask:
10581 case CODE_FOR_avx512vl_loadv32qi_mask:
10582 case CODE_FOR_avx512vl_loadv16qi_mask:
10583 case CODE_FOR_avx512bw_loadv32hi_mask:
10584 case CODE_FOR_avx512vl_loadv16hi_mask:
10585 case CODE_FOR_avx512vl_loadv8hi_mask:
10586 aligned_mem = true;
10587 break;
10588 default:
10589 break;
10590 }
10591 /* FALLTHRU */
10592 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10593 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10594 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10595 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10596 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10597 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10598 case V16SI_FTYPE_PCINT_V16SI_UHI:
10599 case V8SI_FTYPE_PCINT_V8SI_UQI:
10600 case V4SI_FTYPE_PCINT_V4SI_UQI:
10601 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10602 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10603 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10604 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10605 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10606 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10607 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10608 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10609 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10610 nargs = 3;
10611 klass = load;
10612 memory = 0;
10613 break;
10614 case VOID_FTYPE_UINT_UINT_UINT:
10615 case VOID_FTYPE_UINT64_UINT_UINT:
10616 case UCHAR_FTYPE_UINT_UINT_UINT:
10617 case UCHAR_FTYPE_UINT64_UINT_UINT:
10618 nargs = 3;
10619 klass = load;
10620 memory = ARRAY_SIZE (args);
10621 last_arg_constant = true;
10622 break;
10623 default:
10624 gcc_unreachable ();
10625 }
10626
10627 gcc_assert (nargs <= ARRAY_SIZE (args));
10628
10629 if (klass == store)
10630 {
10631 arg = CALL_EXPR_ARG (exp, 0);
10632 op = expand_normal (arg);
10633 gcc_assert (target == 0);
10634 if (memory)
10635 {
10636 op = ix86_zero_extend_to_Pmode (op);
10637 target = gen_rtx_MEM (tmode, op);
10638 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10639 on it. Try to improve it using get_pointer_alignment,
10640 and if the special builtin is one that requires strict
10641 mode alignment, also from its GET_MODE_ALIGNMENT.
10642 Failure to do so could lead to ix86_legitimate_combined_insn
10643 rejecting all changes to such insns. */
10644 unsigned int align = get_pointer_alignment (arg);
10645 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10646 align = GET_MODE_ALIGNMENT (tmode);
10647 if (MEM_ALIGN (target) < align)
10648 set_mem_align (target, align);
10649 }
10650 else
10651 target = force_reg (tmode, op);
10652 arg_adjust = 1;
10653 }
10654 else
10655 {
10656 arg_adjust = 0;
10657 if (optimize
10658 || target == 0
10659 || !register_operand (target, tmode)
10660 || GET_MODE (target) != tmode)
10661 target = gen_reg_rtx (tmode);
10662 }
10663
10664 for (i = 0; i < nargs; i++)
10665 {
10666 machine_mode mode = insn_p->operand[i + 1].mode;
10667 bool match;
10668
10669 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10670 op = expand_normal (arg);
10671 match = insn_p->operand[i + 1].predicate (op, mode);
10672
10673 if (last_arg_constant && (i + 1) == nargs)
10674 {
10675 if (!match)
10676 {
10677 if (icode == CODE_FOR_lwp_lwpvalsi3
10678 || icode == CODE_FOR_lwp_lwpinssi3
10679 || icode == CODE_FOR_lwp_lwpvaldi3
10680 || icode == CODE_FOR_lwp_lwpinsdi3)
10681 error ("the last argument must be a 32-bit immediate");
10682 else
10683 error ("the last argument must be an 8-bit immediate");
10684 return const0_rtx;
10685 }
10686 }
10687 else
10688 {
10689 if (i == memory)
10690 {
10691 /* This must be the memory operand. */
10692 op = ix86_zero_extend_to_Pmode (op);
10693 op = gen_rtx_MEM (mode, op);
10694 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10695 on it. Try to improve it using get_pointer_alignment,
10696 and if the special builtin is one that requires strict
10697 mode alignment, also from its GET_MODE_ALIGNMENT.
10698 Failure to do so could lead to ix86_legitimate_combined_insn
10699 rejecting all changes to such insns. */
10700 unsigned int align = get_pointer_alignment (arg);
10701 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10702 align = GET_MODE_ALIGNMENT (mode);
10703 if (MEM_ALIGN (op) < align)
10704 set_mem_align (op, align);
10705 }
10706 else
10707 {
10708 /* This must be a register.  */
10709 if (VECTOR_MODE_P (mode))
10710 op = safe_vector_operand (op, mode);
10711
10712 op = fixup_modeless_constant (op, mode);
10713
10714 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10715 op = copy_to_mode_reg (mode, op);
10716 else
10717 {
10718 op = copy_to_reg (op);
10719 op = lowpart_subreg (mode, op, GET_MODE (op));
10720 }
10721 }
10722 }
10723
10724 args[i].op = op;
10725 args[i].mode = mode;
10726 }
10727
10728 switch (nargs)
10729 {
10730 case 0:
10731 pat = GEN_FCN (icode) (target);
10732 break;
10733 case 1:
10734 pat = GEN_FCN (icode) (target, args[0].op);
10735 break;
10736 case 2:
10737 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10738 break;
10739 case 3:
10740 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10741 break;
10742 default:
10743 gcc_unreachable ();
10744 }
10745
10746 if (! pat)
10747 return 0;
10748 emit_insn (pat);
10749 return klass == store ? 0 : target;
10750 }
10751
10752 /* Return the integer constant in ARG. Constrain it to be in the range
10753 of the subparts of VEC_TYPE; issue an error if not. */
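/* For example, for a four-element vector type the valid selector
   range is [0, 3].  */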
10754
10755 static int
10756 get_element_number (tree vec_type, tree arg)
10757 {
10758 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10759
10760 if (!tree_fits_uhwi_p (arg)
10761 || (elt = tree_to_uhwi (arg), elt > max))
10762 {
10763 error ("selector must be an integer constant in the range "
10764 "[0, %wi]", max);
10765 return 0;
10766 }
10767
10768 return elt;
10769 }
10770
10771 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10772 ix86_expand_vector_init. We DO have language-level syntax for this, in
10773 the form of (type){ init-list }. Except that since we can't place emms
10774 instructions from inside the compiler, we can't allow the use of MMX
10775 registers unless the user explicitly asks for it. So we do *not* define
10776 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10777 we have builtins invoked by mmintrin.h that gives us license to emit
10778 these sorts of instructions. */
10779
10780 static rtx
10781 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10782 {
10783 machine_mode tmode = TYPE_MODE (type);
10784 machine_mode inner_mode = GET_MODE_INNER (tmode);
10785 int i, n_elt = GET_MODE_NUNITS (tmode);
10786 rtvec v = rtvec_alloc (n_elt);
10787
10788 gcc_assert (VECTOR_MODE_P (tmode));
10789 gcc_assert (call_expr_nargs (exp) == n_elt);
10790
10791 for (i = 0; i < n_elt; ++i)
10792 {
10793 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10794 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10795 }
10796
10797 if (!target || !register_operand (target, tmode))
10798 target = gen_reg_rtx (tmode);
10799
10800 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10801 return target;
10802 }
10803
10804 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10805 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10806 had a language-level syntax for referencing vector elements. */
10807
10808 static rtx
10809 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10810 {
10811 machine_mode tmode, mode0;
10812 tree arg0, arg1;
10813 int elt;
10814 rtx op0;
10815
10816 arg0 = CALL_EXPR_ARG (exp, 0);
10817 arg1 = CALL_EXPR_ARG (exp, 1);
10818
10819 op0 = expand_normal (arg0);
10820 elt = get_element_number (TREE_TYPE (arg0), arg1);
10821
10822 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10823 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10824 gcc_assert (VECTOR_MODE_P (mode0));
10825
10826 op0 = force_reg (mode0, op0);
10827
10828 if (optimize || !target || !register_operand (target, tmode))
10829 target = gen_reg_rtx (tmode);
10830
10831 ix86_expand_vector_extract (true, target, op0, elt);
10832
10833 return target;
10834 }
10835
10836 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10837 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10838 a language-level syntax for referencing vector elements. */
10839
10840 static rtx
10841 ix86_expand_vec_set_builtin (tree exp)
10842 {
10843 machine_mode tmode, mode1;
10844 tree arg0, arg1, arg2;
10845 int elt;
10846 rtx op0, op1, target;
10847
10848 arg0 = CALL_EXPR_ARG (exp, 0);
10849 arg1 = CALL_EXPR_ARG (exp, 1);
10850 arg2 = CALL_EXPR_ARG (exp, 2);
10851
10852 tmode = TYPE_MODE (TREE_TYPE (arg0));
10853 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10854 gcc_assert (VECTOR_MODE_P (tmode));
10855
10856 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10857 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10858 elt = get_element_number (TREE_TYPE (arg0), arg2);
10859
10860 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10861 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10862
10863 op0 = force_reg (tmode, op0);
10864 op1 = force_reg (mode1, op1);
10865
10866 /* OP0 is the source of these builtin functions and shouldn't be
10867 modified. Create a copy, use it and return it as target. */
10868 target = gen_reg_rtx (tmode);
10869 emit_move_insn (target, op0);
10870 ix86_expand_vector_set (true, target, op1, elt);
10871
10872 return target;
10873 }
10874
10875 /* Expand an expression EXP that calls a built-in function,
10876 with result going to TARGET if that's convenient
10877 (and in mode MODE if that's convenient).
10878 SUBTARGET may be used as the target for computing one of EXP's operands.
10879 IGNORE is nonzero if the value is to be ignored. */
10880
10881 rtx
10882 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10883 machine_mode mode, int ignore)
10884 {
10885 size_t i;
10886 enum insn_code icode, icode2;
10887 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10888 tree arg0, arg1, arg2, arg3, arg4;
10889 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10890 machine_mode mode0, mode1, mode2, mode3, mode4;
10891 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
10892
10893 /* For CPU builtins that can be folded, fold first and expand the fold. */
10894 switch (fcode)
10895 {
10896 case IX86_BUILTIN_CPU_INIT:
10897 {
10898 /* Make it call __cpu_indicator_init in libgcc. */
10899 tree call_expr, fndecl, type;
10900 type = build_function_type_list (integer_type_node, NULL_TREE);
10901 fndecl = build_fn_decl ("__cpu_indicator_init", type);
10902 call_expr = build_call_expr (fndecl, 0);
10903 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
10904 }
10905 case IX86_BUILTIN_CPU_IS:
10906 case IX86_BUILTIN_CPU_SUPPORTS:
10907 {
10908 tree arg0 = CALL_EXPR_ARG (exp, 0);
10909 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
10910 gcc_assert (fold_expr != NULL_TREE);
10911 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
10912 }
10913 }
10914
10915 HOST_WIDE_INT isa = ix86_isa_flags;
10916 HOST_WIDE_INT isa2 = ix86_isa_flags2;
10917 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
10918 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
10919 /* The general case is we require all the ISAs specified in bisa{,2}
10920 to be enabled.
10921 The exceptions are:
10922 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
10923 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
10924 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
10925 where for each such pair it is sufficient if either of the ISAs is
10926 enabled; any other options ored in with the pair must still be enabled.  */
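/* For example, a builtin whose mask is
   OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A is accepted as soon
   as either of those two ISAs is enabled.  */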
10927 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10928 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10929 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
10930 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
10931 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10932 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10933 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
10934 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
10935 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10936 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10937 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
10938 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
10939 /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
10940 MMX is disabled. NB: Since MMX intrinsics are marked with
10941 SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
10942 enabled. */
10943 if (TARGET_MMX || TARGET_MMX_WITH_SSE)
10944 {
10945 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
10946 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
10947 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
10948 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
10949 if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
10950 == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
10951 && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
10952 isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
10953 if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
10954 == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
10955 && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
10956 isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
10957 }
10958 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
10959 {
10960 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
10961 if (TARGET_ABI_X32)
10962 bisa |= OPTION_MASK_ABI_X32;
10963 else
10964 bisa |= OPTION_MASK_ABI_64;
10965 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
10966 (enum fpmath_unit) 0, false, add_abi_p);
10967 if (!opts)
10968 error ("%qE needs unknown isa option", fndecl);
10969 else
10970 {
10971 gcc_assert (opts != NULL);
10972 error ("%qE needs isa option %s", fndecl, opts);
10973 free (opts);
10974 }
10975 return expand_call (exp, target, ignore);
10976 }
10977
10978 switch (fcode)
10979 {
10980 case IX86_BUILTIN_MASKMOVQ:
10981 case IX86_BUILTIN_MASKMOVDQU:
10982 icode = (fcode == IX86_BUILTIN_MASKMOVQ
10983 ? CODE_FOR_mmx_maskmovq
10984 : CODE_FOR_sse2_maskmovdqu);
10985 /* Note the arg order is different from the operand order. */
10986 arg1 = CALL_EXPR_ARG (exp, 0);
10987 arg2 = CALL_EXPR_ARG (exp, 1);
10988 arg0 = CALL_EXPR_ARG (exp, 2);
10989 op0 = expand_normal (arg0);
10990 op1 = expand_normal (arg1);
10991 op2 = expand_normal (arg2);
10992 mode0 = insn_data[icode].operand[0].mode;
10993 mode1 = insn_data[icode].operand[1].mode;
10994 mode2 = insn_data[icode].operand[2].mode;
10995
10996 op0 = ix86_zero_extend_to_Pmode (op0);
10997 op0 = gen_rtx_MEM (mode1, op0);
10998
10999 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11000 op0 = copy_to_mode_reg (mode0, op0);
11001 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11002 op1 = copy_to_mode_reg (mode1, op1);
11003 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11004 op2 = copy_to_mode_reg (mode2, op2);
11005 pat = GEN_FCN (icode) (op0, op1, op2);
11006 if (! pat)
11007 return 0;
11008 emit_insn (pat);
11009 return 0;
11010
11011 case IX86_BUILTIN_LDMXCSR:
11012 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11013 target = assign_386_stack_local (SImode, SLOT_TEMP);
11014 emit_move_insn (target, op0);
11015 emit_insn (gen_sse_ldmxcsr (target));
11016 return 0;
11017
11018 case IX86_BUILTIN_STMXCSR:
11019 target = assign_386_stack_local (SImode, SLOT_TEMP);
11020 emit_insn (gen_sse_stmxcsr (target));
11021 return copy_to_mode_reg (SImode, target);
11022
11023 case IX86_BUILTIN_CLFLUSH:
11024 arg0 = CALL_EXPR_ARG (exp, 0);
11025 op0 = expand_normal (arg0);
11026 icode = CODE_FOR_sse2_clflush;
11027 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11028 op0 = ix86_zero_extend_to_Pmode (op0);
11029
11030 emit_insn (gen_sse2_clflush (op0));
11031 return 0;
11032
11033 case IX86_BUILTIN_CLWB:
11034 arg0 = CALL_EXPR_ARG (exp, 0);
11035 op0 = expand_normal (arg0);
11036 icode = CODE_FOR_clwb;
11037 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11038 op0 = ix86_zero_extend_to_Pmode (op0);
11039
11040 emit_insn (gen_clwb (op0));
11041 return 0;
11042
11043 case IX86_BUILTIN_CLFLUSHOPT:
11044 arg0 = CALL_EXPR_ARG (exp, 0);
11045 op0 = expand_normal (arg0);
11046 icode = CODE_FOR_clflushopt;
11047 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11048 op0 = ix86_zero_extend_to_Pmode (op0);
11049
11050 emit_insn (gen_clflushopt (op0));
11051 return 0;
11052
11053 case IX86_BUILTIN_MONITOR:
11054 case IX86_BUILTIN_MONITORX:
11055 arg0 = CALL_EXPR_ARG (exp, 0);
11056 arg1 = CALL_EXPR_ARG (exp, 1);
11057 arg2 = CALL_EXPR_ARG (exp, 2);
11058 op0 = expand_normal (arg0);
11059 op1 = expand_normal (arg1);
11060 op2 = expand_normal (arg2);
11061 if (!REG_P (op0))
11062 op0 = ix86_zero_extend_to_Pmode (op0);
11063 if (!REG_P (op1))
11064 op1 = copy_to_mode_reg (SImode, op1);
11065 if (!REG_P (op2))
11066 op2 = copy_to_mode_reg (SImode, op2);
11067
11068 emit_insn (fcode == IX86_BUILTIN_MONITOR
11069 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11070 : gen_monitorx (Pmode, op0, op1, op2));
11071 return 0;
11072
11073 case IX86_BUILTIN_MWAIT:
11074 arg0 = CALL_EXPR_ARG (exp, 0);
11075 arg1 = CALL_EXPR_ARG (exp, 1);
11076 op0 = expand_normal (arg0);
11077 op1 = expand_normal (arg1);
11078 if (!REG_P (op0))
11079 op0 = copy_to_mode_reg (SImode, op0);
11080 if (!REG_P (op1))
11081 op1 = copy_to_mode_reg (SImode, op1);
11082 emit_insn (gen_sse3_mwait (op0, op1));
11083 return 0;
11084
11085 case IX86_BUILTIN_MWAITX:
11086 arg0 = CALL_EXPR_ARG (exp, 0);
11087 arg1 = CALL_EXPR_ARG (exp, 1);
11088 arg2 = CALL_EXPR_ARG (exp, 2);
11089 op0 = expand_normal (arg0);
11090 op1 = expand_normal (arg1);
11091 op2 = expand_normal (arg2);
11092 if (!REG_P (op0))
11093 op0 = copy_to_mode_reg (SImode, op0);
11094 if (!REG_P (op1))
11095 op1 = copy_to_mode_reg (SImode, op1);
11096 if (!REG_P (op2))
11097 op2 = copy_to_mode_reg (SImode, op2);
11098 emit_insn (gen_mwaitx (op0, op1, op2));
11099 return 0;
11100
11101 case IX86_BUILTIN_UMONITOR:
11102 arg0 = CALL_EXPR_ARG (exp, 0);
11103 op0 = expand_normal (arg0);
11104
11105 op0 = ix86_zero_extend_to_Pmode (op0);
11106
11107 insn = (TARGET_64BIT
11108 ? gen_umonitor_di (op0)
11109 : gen_umonitor_si (op0));
11110
11111 emit_insn (insn);
11112 return 0;
11113
11114 case IX86_BUILTIN_UMWAIT:
11115 case IX86_BUILTIN_TPAUSE:
11116 arg0 = CALL_EXPR_ARG (exp, 0);
11117 arg1 = CALL_EXPR_ARG (exp, 1);
11118 op0 = expand_normal (arg0);
11119 op1 = expand_normal (arg1);
11120
11121 if (!REG_P (op0))
11122 op0 = copy_to_mode_reg (SImode, op0);
11123
11124 op1 = force_reg (DImode, op1);
11125
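/* The 64-bit deadline is passed to the instruction as two 32-bit
   halves (EDX:EAX); in 64-bit mode split it here for the *_rex64
   patterns.  */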
11126 if (TARGET_64BIT)
11127 {
11128 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11129 NULL, 1, OPTAB_DIRECT);
11130 switch (fcode)
11131 {
11132 case IX86_BUILTIN_UMWAIT:
11133 icode = CODE_FOR_umwait_rex64;
11134 break;
11135 case IX86_BUILTIN_TPAUSE:
11136 icode = CODE_FOR_tpause_rex64;
11137 break;
11138 default:
11139 gcc_unreachable ();
11140 }
11141
11142 op2 = gen_lowpart (SImode, op2);
11143 op1 = gen_lowpart (SImode, op1);
11144 pat = GEN_FCN (icode) (op0, op1, op2);
11145 }
11146 else
11147 {
11148 switch (fcode)
11149 {
11150 case IX86_BUILTIN_UMWAIT:
11151 icode = CODE_FOR_umwait;
11152 break;
11153 case IX86_BUILTIN_TPAUSE:
11154 icode = CODE_FOR_tpause;
11155 break;
11156 default:
11157 gcc_unreachable ();
11158 }
11159 pat = GEN_FCN (icode) (op0, op1);
11160 }
11161
11162 if (!pat)
11163 return 0;
11164
11165 emit_insn (pat);
11166
11167 if (target == 0
11168 || !register_operand (target, QImode))
11169 target = gen_reg_rtx (QImode);
11170
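/* UMWAIT and TPAUSE report their status in the carry flag; copy it
   into the QImode result.  */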
11171 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11172 const0_rtx);
11173 emit_insn (gen_rtx_SET (target, pat));
11174
11175 return target;
11176
11177 case IX86_BUILTIN_CLZERO:
11178 arg0 = CALL_EXPR_ARG (exp, 0);
11179 op0 = expand_normal (arg0);
11180 if (!REG_P (op0))
11181 op0 = ix86_zero_extend_to_Pmode (op0);
11182 emit_insn (gen_clzero (Pmode, op0));
11183 return 0;
11184
11185 case IX86_BUILTIN_CLDEMOTE:
11186 arg0 = CALL_EXPR_ARG (exp, 0);
11187 op0 = expand_normal (arg0);
11188 icode = CODE_FOR_cldemote;
11189 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11190 op0 = ix86_zero_extend_to_Pmode (op0);
11191
11192 emit_insn (gen_cldemote (op0));
11193 return 0;
11194
11195 case IX86_BUILTIN_VEC_INIT_V2SI:
11196 case IX86_BUILTIN_VEC_INIT_V4HI:
11197 case IX86_BUILTIN_VEC_INIT_V8QI:
11198 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11199
11200 case IX86_BUILTIN_VEC_EXT_V2DF:
11201 case IX86_BUILTIN_VEC_EXT_V2DI:
11202 case IX86_BUILTIN_VEC_EXT_V4SF:
11203 case IX86_BUILTIN_VEC_EXT_V4SI:
11204 case IX86_BUILTIN_VEC_EXT_V8HI:
11205 case IX86_BUILTIN_VEC_EXT_V2SI:
11206 case IX86_BUILTIN_VEC_EXT_V4HI:
11207 case IX86_BUILTIN_VEC_EXT_V16QI:
11208 return ix86_expand_vec_ext_builtin (exp, target);
11209
11210 case IX86_BUILTIN_VEC_SET_V2DI:
11211 case IX86_BUILTIN_VEC_SET_V4SF:
11212 case IX86_BUILTIN_VEC_SET_V4SI:
11213 case IX86_BUILTIN_VEC_SET_V8HI:
11214 case IX86_BUILTIN_VEC_SET_V4HI:
11215 case IX86_BUILTIN_VEC_SET_V16QI:
11216 return ix86_expand_vec_set_builtin (exp);
11217
11218 case IX86_BUILTIN_NANQ:
11219 case IX86_BUILTIN_NANSQ:
11220 return expand_call (exp, target, ignore);
11221
11222 case IX86_BUILTIN_RDPID:
11223
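/* Descriptive comment (added): RDPID reads the value of the IA32_TSC_AUX MSR,
   conventionally the current logical-processor ID.  The 64-bit pattern writes
   a full word_mode register, so the result is narrowed back to SImode before
   being copied to the target.  */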
11224 op0 = gen_reg_rtx (word_mode);
11225
11226 if (TARGET_64BIT)
11227 {
11228 insn = gen_rdpid_rex64 (op0);
11229 op0 = convert_to_mode (SImode, op0, 1);
11230 }
11231 else
11232 insn = gen_rdpid (op0);
11233
11234 emit_insn (insn);
11235
11236 if (target == 0
11237 || !register_operand (target, SImode))
11238 target = gen_reg_rtx (SImode);
11239
11240 emit_move_insn (target, op0);
11241 return target;
11242
11243 case IX86_BUILTIN_RDPMC:
11244 case IX86_BUILTIN_RDTSC:
11245 case IX86_BUILTIN_RDTSCP:
11246 case IX86_BUILTIN_XGETBV:
11247
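/* Descriptive comment (added): each of these returns a 64-bit value in
   EDX:EAX.  RDPMC and XGETBV additionally take a selector in ECX, while
   RDTSCP outputs the IA32_TSC_AUX value in ECX, which is stored through the
   builtin's pointer argument.  On 64-bit targets the two 32-bit halves in
   OP0/OP1 are recombined below with a shift and IOR; on 32-bit targets the
   DImode result lands directly in OP0.  */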
11248 op0 = gen_reg_rtx (DImode);
11249 op1 = gen_reg_rtx (DImode);
11250
11251 if (fcode == IX86_BUILTIN_RDPMC)
11252 {
11253 arg0 = CALL_EXPR_ARG (exp, 0);
11254 op2 = expand_normal (arg0);
11255 if (!register_operand (op2, SImode))
11256 op2 = copy_to_mode_reg (SImode, op2);
11257
11258 insn = (TARGET_64BIT
11259 ? gen_rdpmc_rex64 (op0, op1, op2)
11260 : gen_rdpmc (op0, op2));
11261 emit_insn (insn);
11262 }
11263 else if (fcode == IX86_BUILTIN_XGETBV)
11264 {
11265 arg0 = CALL_EXPR_ARG (exp, 0);
11266 op2 = expand_normal (arg0);
11267 if (!register_operand (op2, SImode))
11268 op2 = copy_to_mode_reg (SImode, op2);
11269
11270 insn = (TARGET_64BIT
11271 ? gen_xgetbv_rex64 (op0, op1, op2)
11272 : gen_xgetbv (op0, op2));
11273 emit_insn (insn);
11274 }
11275 else if (fcode == IX86_BUILTIN_RDTSC)
11276 {
11277 insn = (TARGET_64BIT
11278 ? gen_rdtsc_rex64 (op0, op1)
11279 : gen_rdtsc (op0));
11280 emit_insn (insn);
11281 }
11282 else
11283 {
11284 op2 = gen_reg_rtx (SImode);
11285
11286 insn = (TARGET_64BIT
11287 ? gen_rdtscp_rex64 (op0, op1, op2)
11288 : gen_rdtscp (op0, op2));
11289 emit_insn (insn);
11290
11291 arg0 = CALL_EXPR_ARG (exp, 0);
11292 op4 = expand_normal (arg0);
11293 if (!address_operand (op4, VOIDmode))
11294 {
11295 op4 = convert_memory_address (Pmode, op4);
11296 op4 = copy_addr_to_reg (op4);
11297 }
11298 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11299 }
11300
11301 if (target == 0
11302 || !register_operand (target, DImode))
11303 target = gen_reg_rtx (DImode);
11304
11305 if (TARGET_64BIT)
11306 {
11307 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11308 op1, 1, OPTAB_DIRECT);
11309 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11310 op0, 1, OPTAB_DIRECT);
11311 }
11312
11313 emit_move_insn (target, op0);
11314 return target;
11315
11316 case IX86_BUILTIN_ENQCMD:
11317 case IX86_BUILTIN_ENQCMDS:
11318 case IX86_BUILTIN_MOVDIR64B:
11319
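/* Descriptive comment (added): all three builtins copy a 64-byte command from
   the source memory operand (modelled as an XImode MEM) to the destination
   address in OP0.  MOVDIR64B has no status to report, whereas ENQCMD and
   ENQCMDS report in ZF whether the enqueue was accepted; that flag is
   materialized into the int return value below via a STRICT_LOW_PART
   setcc.  */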
11320 arg0 = CALL_EXPR_ARG (exp, 0);
11321 arg1 = CALL_EXPR_ARG (exp, 1);
11322 op0 = expand_normal (arg0);
11323 op1 = expand_normal (arg1);
11324
11325 op0 = ix86_zero_extend_to_Pmode (op0);
11326 if (!address_operand (op1, VOIDmode))
11327 {
11328 op1 = convert_memory_address (Pmode, op1);
11329 op1 = copy_addr_to_reg (op1);
11330 }
11331 op1 = gen_rtx_MEM (XImode, op1);
11332
11333 if (fcode == IX86_BUILTIN_MOVDIR64B)
11334 {
11335 emit_insn (gen_movdir64b (Pmode, op0, op1));
11336 return 0;
11337 }
11338 else
11339 {
11340 rtx pat;
11341
11342 target = gen_reg_rtx (SImode);
11343 emit_move_insn (target, const0_rtx);
11344 target = gen_rtx_SUBREG (QImode, target, 0);
11345
11346 if (fcode == IX86_BUILTIN_ENQCMD)
11347 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11348 else
11349 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11350
11351 emit_insn (pat);
11352
11353 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11354 gen_rtx_fmt_ee (EQ, QImode,
11355 SET_DEST (pat),
11356 const0_rtx)));
11357
11358 return SUBREG_REG (target);
11359 }
11360
11361 case IX86_BUILTIN_FXSAVE:
11362 case IX86_BUILTIN_FXRSTOR:
11363 case IX86_BUILTIN_FXSAVE64:
11364 case IX86_BUILTIN_FXRSTOR64:
11365 case IX86_BUILTIN_FNSTENV:
11366 case IX86_BUILTIN_FLDENV:
11367 mode0 = BLKmode;
11368 switch (fcode)
11369 {
11370 case IX86_BUILTIN_FXSAVE:
11371 icode = CODE_FOR_fxsave;
11372 break;
11373 case IX86_BUILTIN_FXRSTOR:
11374 icode = CODE_FOR_fxrstor;
11375 break;
11376 case IX86_BUILTIN_FXSAVE64:
11377 icode = CODE_FOR_fxsave64;
11378 break;
11379 case IX86_BUILTIN_FXRSTOR64:
11380 icode = CODE_FOR_fxrstor64;
11381 break;
11382 case IX86_BUILTIN_FNSTENV:
11383 icode = CODE_FOR_fnstenv;
11384 break;
11385 case IX86_BUILTIN_FLDENV:
11386 icode = CODE_FOR_fldenv;
11387 break;
11388 default:
11389 gcc_unreachable ();
11390 }
11391
11392 arg0 = CALL_EXPR_ARG (exp, 0);
11393 op0 = expand_normal (arg0);
11394
11395 if (!address_operand (op0, VOIDmode))
11396 {
11397 op0 = convert_memory_address (Pmode, op0);
11398 op0 = copy_addr_to_reg (op0);
11399 }
11400 op0 = gen_rtx_MEM (mode0, op0);
11401
11402 pat = GEN_FCN (icode) (op0);
11403 if (pat)
11404 emit_insn (pat);
11405 return 0;
11406
11407 case IX86_BUILTIN_XSETBV:
11408 arg0 = CALL_EXPR_ARG (exp, 0);
11409 arg1 = CALL_EXPR_ARG (exp, 1);
11410 op0 = expand_normal (arg0);
11411 op1 = expand_normal (arg1);
11412
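/* Descriptive comment (added): XSETBV writes the 64-bit value in EDX:EAX to
   the extended control register selected by ECX.  As for UMWAIT/TPAUSE above,
   the DImode value is split into two SImode halves on 64-bit targets.  */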
11413 if (!REG_P (op0))
11414 op0 = copy_to_mode_reg (SImode, op0);
11415
11416 op1 = force_reg (DImode, op1);
11417
11418 if (TARGET_64BIT)
11419 {
11420 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11421 NULL, 1, OPTAB_DIRECT);
11422
11423 icode = CODE_FOR_xsetbv_rex64;
11424
11425 op2 = gen_lowpart (SImode, op2);
11426 op1 = gen_lowpart (SImode, op1);
11427 pat = GEN_FCN (icode) (op0, op1, op2);
11428 }
11429 else
11430 {
11431 icode = CODE_FOR_xsetbv;
11432
11433 pat = GEN_FCN (icode) (op0, op1);
11434 }
11435 if (pat)
11436 emit_insn (pat);
11437 return 0;
11438
11439 case IX86_BUILTIN_XSAVE:
11440 case IX86_BUILTIN_XRSTOR:
11441 case IX86_BUILTIN_XSAVE64:
11442 case IX86_BUILTIN_XRSTOR64:
11443 case IX86_BUILTIN_XSAVEOPT:
11444 case IX86_BUILTIN_XSAVEOPT64:
11445 case IX86_BUILTIN_XSAVES:
11446 case IX86_BUILTIN_XRSTORS:
11447 case IX86_BUILTIN_XSAVES64:
11448 case IX86_BUILTIN_XRSTORS64:
11449 case IX86_BUILTIN_XSAVEC:
11450 case IX86_BUILTIN_XSAVEC64:
11451 arg0 = CALL_EXPR_ARG (exp, 0);
11452 arg1 = CALL_EXPR_ARG (exp, 1);
11453 op0 = expand_normal (arg0);
11454 op1 = expand_normal (arg1);
11455
11456 if (!address_operand (op0, VOIDmode))
11457 {
11458 op0 = convert_memory_address (Pmode, op0);
11459 op0 = copy_addr_to_reg (op0);
11460 }
11461 op0 = gen_rtx_MEM (BLKmode, op0);
11462
11463 op1 = force_reg (DImode, op1);
11464
11465 if (TARGET_64BIT)
11466 {
11467 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11468 NULL, 1, OPTAB_DIRECT);
11469 switch (fcode)
11470 {
11471 case IX86_BUILTIN_XSAVE:
11472 icode = CODE_FOR_xsave_rex64;
11473 break;
11474 case IX86_BUILTIN_XRSTOR:
11475 icode = CODE_FOR_xrstor_rex64;
11476 break;
11477 case IX86_BUILTIN_XSAVE64:
11478 icode = CODE_FOR_xsave64;
11479 break;
11480 case IX86_BUILTIN_XRSTOR64:
11481 icode = CODE_FOR_xrstor64;
11482 break;
11483 case IX86_BUILTIN_XSAVEOPT:
11484 icode = CODE_FOR_xsaveopt_rex64;
11485 break;
11486 case IX86_BUILTIN_XSAVEOPT64:
11487 icode = CODE_FOR_xsaveopt64;
11488 break;
11489 case IX86_BUILTIN_XSAVES:
11490 icode = CODE_FOR_xsaves_rex64;
11491 break;
11492 case IX86_BUILTIN_XRSTORS:
11493 icode = CODE_FOR_xrstors_rex64;
11494 break;
11495 case IX86_BUILTIN_XSAVES64:
11496 icode = CODE_FOR_xsaves64;
11497 break;
11498 case IX86_BUILTIN_XRSTORS64:
11499 icode = CODE_FOR_xrstors64;
11500 break;
11501 case IX86_BUILTIN_XSAVEC:
11502 icode = CODE_FOR_xsavec_rex64;
11503 break;
11504 case IX86_BUILTIN_XSAVEC64:
11505 icode = CODE_FOR_xsavec64;
11506 break;
11507 default:
11508 gcc_unreachable ();
11509 }
11510
11511 op2 = gen_lowpart (SImode, op2);
11512 op1 = gen_lowpart (SImode, op1);
11513 pat = GEN_FCN (icode) (op0, op1, op2);
11514 }
11515 else
11516 {
11517 switch (fcode)
11518 {
11519 case IX86_BUILTIN_XSAVE:
11520 icode = CODE_FOR_xsave;
11521 break;
11522 case IX86_BUILTIN_XRSTOR:
11523 icode = CODE_FOR_xrstor;
11524 break;
11525 case IX86_BUILTIN_XSAVEOPT:
11526 icode = CODE_FOR_xsaveopt;
11527 break;
11528 case IX86_BUILTIN_XSAVES:
11529 icode = CODE_FOR_xsaves;
11530 break;
11531 case IX86_BUILTIN_XRSTORS:
11532 icode = CODE_FOR_xrstors;
11533 break;
11534 case IX86_BUILTIN_XSAVEC:
11535 icode = CODE_FOR_xsavec;
11536 break;
11537 default:
11538 gcc_unreachable ();
11539 }
11540 pat = GEN_FCN (icode) (op0, op1);
11541 }
11542
11543 if (pat)
11544 emit_insn (pat);
11545 return 0;
11546
11547 case IX86_BUILTIN_LLWPCB:
11548 arg0 = CALL_EXPR_ARG (exp, 0);
11549 op0 = expand_normal (arg0);
11550 icode = CODE_FOR_lwp_llwpcb;
11551 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11552 op0 = ix86_zero_extend_to_Pmode (op0);
11553 emit_insn (gen_lwp_llwpcb (op0));
11554 return 0;
11555
11556 case IX86_BUILTIN_SLWPCB:
11557 icode = CODE_FOR_lwp_slwpcb;
11558 if (!target
11559 || !insn_data[icode].operand[0].predicate (target, Pmode))
11560 target = gen_reg_rtx (Pmode);
11561 emit_insn (gen_lwp_slwpcb (target));
11562 return target;
11563
11564 case IX86_BUILTIN_BEXTRI32:
11565 case IX86_BUILTIN_BEXTRI64:
11566 arg0 = CALL_EXPR_ARG (exp, 0);
11567 arg1 = CALL_EXPR_ARG (exp, 1);
11568 op0 = expand_normal (arg0);
11569 op1 = expand_normal (arg1);
11570 icode = (fcode == IX86_BUILTIN_BEXTRI32
11571 ? CODE_FOR_tbm_bextri_si
11572 : CODE_FOR_tbm_bextri_di);
11573 if (!CONST_INT_P (op1))
11574 {
11575 error ("last argument must be an immediate");
11576 return const0_rtx;
11577 }
11578 else
11579 {
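/* Descriptive comment (added): the BEXTRI control operand packs the bit-field
   length in bits 15:8 and the starting bit position in bits 7:0; split it
   into the two immediates the tbm_bextri patterns expect.  */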
11580 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11581 unsigned char lsb_index = INTVAL (op1) & 0xFF;
11582 op1 = GEN_INT (length);
11583 op2 = GEN_INT (lsb_index);
11584
11585 mode1 = insn_data[icode].operand[1].mode;
11586 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11587 op0 = copy_to_mode_reg (mode1, op0);
11588
11589 mode0 = insn_data[icode].operand[0].mode;
11590 if (target == 0
11591 || !register_operand (target, mode0))
11592 target = gen_reg_rtx (mode0);
11593
11594 pat = GEN_FCN (icode) (target, op0, op1, op2);
11595 if (pat)
11596 emit_insn (pat);
11597 return target;
11598 }
11599
11600 case IX86_BUILTIN_RDRAND16_STEP:
11601 icode = CODE_FOR_rdrandhi_1;
11602 mode0 = HImode;
11603 goto rdrand_step;
11604
11605 case IX86_BUILTIN_RDRAND32_STEP:
11606 icode = CODE_FOR_rdrandsi_1;
11607 mode0 = SImode;
11608 goto rdrand_step;
11609
11610 case IX86_BUILTIN_RDRAND64_STEP:
11611 icode = CODE_FOR_rdranddi_1;
11612 mode0 = DImode;
11613
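/* Descriptive comment (added): the *_step builtins (e.g. _rdrand32_step)
   store the random value through the pointer argument and return an int that
   is nonzero only when the hardware reported success in the carry flag.  The
   conditional move below selects 1 when CF was set and otherwise returns the
   destination value, relying on the destination being left as zero when no
   random number was available.  */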
11614 rdrand_step:
11615 arg0 = CALL_EXPR_ARG (exp, 0);
11616 op1 = expand_normal (arg0);
11617 if (!address_operand (op1, VOIDmode))
11618 {
11619 op1 = convert_memory_address (Pmode, op1);
11620 op1 = copy_addr_to_reg (op1);
11621 }
11622
11623 op0 = gen_reg_rtx (mode0);
11624 emit_insn (GEN_FCN (icode) (op0));
11625
11626 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11627
11628 op1 = gen_reg_rtx (SImode);
11629 emit_move_insn (op1, CONST1_RTX (SImode));
11630
11631 /* Emit SImode conditional move. */
11632 if (mode0 == HImode)
11633 {
11634 if (TARGET_ZERO_EXTEND_WITH_AND
11635 && optimize_function_for_speed_p (cfun))
11636 {
11637 op2 = force_reg (SImode, const0_rtx);
11638
11639 emit_insn (gen_movstricthi
11640 (gen_lowpart (HImode, op2), op0));
11641 }
11642 else
11643 {
11644 op2 = gen_reg_rtx (SImode);
11645
11646 emit_insn (gen_zero_extendhisi2 (op2, op0));
11647 }
11648 }
11649 else if (mode0 == SImode)
11650 op2 = op0;
11651 else
11652 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11653
11654 if (target == 0
11655 || !register_operand (target, SImode))
11656 target = gen_reg_rtx (SImode);
11657
11658 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11659 const0_rtx);
11660 emit_insn (gen_rtx_SET (target,
11661 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11662 return target;
11663
11664 case IX86_BUILTIN_RDSEED16_STEP:
11665 icode = CODE_FOR_rdseedhi_1;
11666 mode0 = HImode;
11667 goto rdseed_step;
11668
11669 case IX86_BUILTIN_RDSEED32_STEP:
11670 icode = CODE_FOR_rdseedsi_1;
11671 mode0 = SImode;
11672 goto rdseed_step;
11673
11674 case IX86_BUILTIN_RDSEED64_STEP:
11675 icode = CODE_FOR_rdseeddi_1;
11676 mode0 = DImode;
11677
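/* Descriptive comment (added): as for RDRAND above, but the success
   indication is produced more directly: CF is copied into a QImode register
   with a setcc-style SET and then zero-extended to the int return value.  */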
11678 rdseed_step:
11679 arg0 = CALL_EXPR_ARG (exp, 0);
11680 op1 = expand_normal (arg0);
11681 if (!address_operand (op1, VOIDmode))
11682 {
11683 op1 = convert_memory_address (Pmode, op1);
11684 op1 = copy_addr_to_reg (op1);
11685 }
11686
11687 op0 = gen_reg_rtx (mode0);
11688 emit_insn (GEN_FCN (icode) (op0));
11689
11690 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11691
11692 op2 = gen_reg_rtx (QImode);
11693
11694 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11695 const0_rtx);
11696 emit_insn (gen_rtx_SET (op2, pat));
11697
11698 if (target == 0
11699 || !register_operand (target, SImode))
11700 target = gen_reg_rtx (SImode);
11701
11702 emit_insn (gen_zero_extendqisi2 (target, op2));
11703 return target;
11704
11705 case IX86_BUILTIN_SBB32:
11706 icode = CODE_FOR_subborrowsi;
11707 icode2 = CODE_FOR_subborrowsi_0;
11708 mode0 = SImode;
11709 mode1 = DImode;
11710 mode2 = CCmode;
11711 goto handlecarry;
11712
11713 case IX86_BUILTIN_SBB64:
11714 icode = CODE_FOR_subborrowdi;
11715 icode2 = CODE_FOR_subborrowdi_0;
11716 mode0 = DImode;
11717 mode1 = TImode;
11718 mode2 = CCmode;
11719 goto handlecarry;
11720
11721 case IX86_BUILTIN_ADDCARRYX32:
11722 icode = CODE_FOR_addcarrysi;
11723 icode2 = CODE_FOR_addcarrysi_0;
11724 mode0 = SImode;
11725 mode1 = DImode;
11726 mode2 = CCCmode;
11727 goto handlecarry;
11728
11729 case IX86_BUILTIN_ADDCARRYX64:
11730 icode = CODE_FOR_addcarrydi;
11731 icode2 = CODE_FOR_addcarrydi_0;
11732 mode0 = DImode;
11733 mode1 = TImode;
11734 mode2 = CCCmode;
11735
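/* Descriptive comment (added): common expansion for the add-with-carry and
   subtract-with-borrow builtins (the _addcarry_u32/_addcarryx_u64/
   _subborrow_u32 family): turn the incoming carry byte into CF, emit the
   flag-consuming add/sub, store the sum through the pointer argument and
   return the resulting carry as an unsigned char.  */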
11736 handlecarry:
11737 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11738 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11739 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11740 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
11741
11742 op1 = expand_normal (arg0);
11743 if (!integer_zerop (arg0))
11744 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11745
11746 op2 = expand_normal (arg1);
11747 if (!register_operand (op2, mode0))
11748 op2 = copy_to_mode_reg (mode0, op2);
11749
11750 op3 = expand_normal (arg2);
11751 if (!register_operand (op3, mode0))
11752 op3 = copy_to_mode_reg (mode0, op3);
11753
11754 op4 = expand_normal (arg3);
11755 if (!address_operand (op4, VOIDmode))
11756 {
11757 op4 = convert_memory_address (Pmode, op4);
11758 op4 = copy_addr_to_reg (op4);
11759 }
11760
11761 op0 = gen_reg_rtx (mode0);
11762 if (integer_zerop (arg0))
11763 {
11764 /* If arg0 is 0, optimize right away into an add or sub
11765 instruction that sets CCCmode flags. */
11766 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11767 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11768 }
11769 else
11770 {
11771 /* Generate CF from input operand. */
11772 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11773
11774 /* Generate instruction that consumes CF. */
11775 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11776 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11777 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11778 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11779 }
11780
11781 /* Return current CF value. */
11782 if (target == 0)
11783 target = gen_reg_rtx (QImode);
11784
11785 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11786 emit_insn (gen_rtx_SET (target, pat));
11787
11788 /* Store the result. */
11789 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11790
11791 return target;
11792
11793 case IX86_BUILTIN_READ_FLAGS:
11794 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11795
11796 if (optimize
11797 || target == NULL_RTX
11798 || !nonimmediate_operand (target, word_mode)
11799 || GET_MODE (target) != word_mode)
11800 target = gen_reg_rtx (word_mode);
11801
11802 emit_insn (gen_pop (target));
11803 return target;
11804
11805 case IX86_BUILTIN_WRITE_FLAGS:
11806
11807 arg0 = CALL_EXPR_ARG (exp, 0);
11808 op0 = expand_normal (arg0);
11809 if (!general_no_elim_operand (op0, word_mode))
11810 op0 = copy_to_mode_reg (word_mode, op0);
11811
11812 emit_insn (gen_push (op0));
11813 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11814 return 0;
11815
11816 case IX86_BUILTIN_KTESTC8:
11817 icode = CODE_FOR_ktestqi;
11818 mode3 = CCCmode;
11819 goto kortest;
11820
11821 case IX86_BUILTIN_KTESTZ8:
11822 icode = CODE_FOR_ktestqi;
11823 mode3 = CCZmode;
11824 goto kortest;
11825
11826 case IX86_BUILTIN_KTESTC16:
11827 icode = CODE_FOR_ktesthi;
11828 mode3 = CCCmode;
11829 goto kortest;
11830
11831 case IX86_BUILTIN_KTESTZ16:
11832 icode = CODE_FOR_ktesthi;
11833 mode3 = CCZmode;
11834 goto kortest;
11835
11836 case IX86_BUILTIN_KTESTC32:
11837 icode = CODE_FOR_ktestsi;
11838 mode3 = CCCmode;
11839 goto kortest;
11840
11841 case IX86_BUILTIN_KTESTZ32:
11842 icode = CODE_FOR_ktestsi;
11843 mode3 = CCZmode;
11844 goto kortest;
11845
11846 case IX86_BUILTIN_KTESTC64:
11847 icode = CODE_FOR_ktestdi;
11848 mode3 = CCCmode;
11849 goto kortest;
11850
11851 case IX86_BUILTIN_KTESTZ64:
11852 icode = CODE_FOR_ktestdi;
11853 mode3 = CCZmode;
11854 goto kortest;
11855
11856 case IX86_BUILTIN_KORTESTC8:
11857 icode = CODE_FOR_kortestqi;
11858 mode3 = CCCmode;
11859 goto kortest;
11860
11861 case IX86_BUILTIN_KORTESTZ8:
11862 icode = CODE_FOR_kortestqi;
11863 mode3 = CCZmode;
11864 goto kortest;
11865
11866 case IX86_BUILTIN_KORTESTC16:
11867 icode = CODE_FOR_kortesthi;
11868 mode3 = CCCmode;
11869 goto kortest;
11870
11871 case IX86_BUILTIN_KORTESTZ16:
11872 icode = CODE_FOR_kortesthi;
11873 mode3 = CCZmode;
11874 goto kortest;
11875
11876 case IX86_BUILTIN_KORTESTC32:
11877 icode = CODE_FOR_kortestsi;
11878 mode3 = CCCmode;
11879 goto kortest;
11880
11881 case IX86_BUILTIN_KORTESTZ32:
11882 icode = CODE_FOR_kortestsi;
11883 mode3 = CCZmode;
11884 goto kortest;
11885
11886 case IX86_BUILTIN_KORTESTC64:
11887 icode = CODE_FOR_kortestdi;
11888 mode3 = CCCmode;
11889 goto kortest;
11890
11891 case IX86_BUILTIN_KORTESTZ64:
11892 icode = CODE_FOR_kortestdi;
11893 mode3 = CCZmode;
11894
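/* Descriptive comment (added): shared expansion for the KTEST and KORTEST
   builtins.  The instruction sets both CF and ZF from the mask operands;
   MODE3 selects which flag the builtin returns (CCCmode for the *C variants,
   CCZmode for the *Z variants), read back with setcc below.  */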
11895 kortest:
11896 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
11897 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
11898 op0 = expand_normal (arg0);
11899 op1 = expand_normal (arg1);
11900
11901 mode0 = insn_data[icode].operand[0].mode;
11902 mode1 = insn_data[icode].operand[1].mode;
11903
11904 if (GET_MODE (op0) != VOIDmode)
11905 op0 = force_reg (GET_MODE (op0), op0);
11906
11907 op0 = gen_lowpart (mode0, op0);
11908
11909 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11910 op0 = copy_to_mode_reg (mode0, op0);
11911
11912 if (GET_MODE (op1) != VOIDmode)
11913 op1 = force_reg (GET_MODE (op1), op1);
11914
11915 op1 = gen_lowpart (mode1, op1);
11916
11917 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11918 op1 = copy_to_mode_reg (mode1, op1);
11919
11920 target = gen_reg_rtx (QImode);
11921
11922 /* Emit kortest. */
11923 emit_insn (GEN_FCN (icode) (op0, op1));
11924 /* And use setcc to return result from flags. */
11925 ix86_expand_setcc (target, EQ,
11926 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
11927 return target;
11928
11929 case IX86_BUILTIN_GATHERSIV2DF:
11930 icode = CODE_FOR_avx2_gathersiv2df;
11931 goto gather_gen;
11932 case IX86_BUILTIN_GATHERSIV4DF:
11933 icode = CODE_FOR_avx2_gathersiv4df;
11934 goto gather_gen;
11935 case IX86_BUILTIN_GATHERDIV2DF:
11936 icode = CODE_FOR_avx2_gatherdiv2df;
11937 goto gather_gen;
11938 case IX86_BUILTIN_GATHERDIV4DF:
11939 icode = CODE_FOR_avx2_gatherdiv4df;
11940 goto gather_gen;
11941 case IX86_BUILTIN_GATHERSIV4SF:
11942 icode = CODE_FOR_avx2_gathersiv4sf;
11943 goto gather_gen;
11944 case IX86_BUILTIN_GATHERSIV8SF:
11945 icode = CODE_FOR_avx2_gathersiv8sf;
11946 goto gather_gen;
11947 case IX86_BUILTIN_GATHERDIV4SF:
11948 icode = CODE_FOR_avx2_gatherdiv4sf;
11949 goto gather_gen;
11950 case IX86_BUILTIN_GATHERDIV8SF:
11951 icode = CODE_FOR_avx2_gatherdiv8sf;
11952 goto gather_gen;
11953 case IX86_BUILTIN_GATHERSIV2DI:
11954 icode = CODE_FOR_avx2_gathersiv2di;
11955 goto gather_gen;
11956 case IX86_BUILTIN_GATHERSIV4DI:
11957 icode = CODE_FOR_avx2_gathersiv4di;
11958 goto gather_gen;
11959 case IX86_BUILTIN_GATHERDIV2DI:
11960 icode = CODE_FOR_avx2_gatherdiv2di;
11961 goto gather_gen;
11962 case IX86_BUILTIN_GATHERDIV4DI:
11963 icode = CODE_FOR_avx2_gatherdiv4di;
11964 goto gather_gen;
11965 case IX86_BUILTIN_GATHERSIV4SI:
11966 icode = CODE_FOR_avx2_gathersiv4si;
11967 goto gather_gen;
11968 case IX86_BUILTIN_GATHERSIV8SI:
11969 icode = CODE_FOR_avx2_gathersiv8si;
11970 goto gather_gen;
11971 case IX86_BUILTIN_GATHERDIV4SI:
11972 icode = CODE_FOR_avx2_gatherdiv4si;
11973 goto gather_gen;
11974 case IX86_BUILTIN_GATHERDIV8SI:
11975 icode = CODE_FOR_avx2_gatherdiv8si;
11976 goto gather_gen;
11977 case IX86_BUILTIN_GATHERALTSIV4DF:
11978 icode = CODE_FOR_avx2_gathersiv4df;
11979 goto gather_gen;
11980 case IX86_BUILTIN_GATHERALTDIV8SF:
11981 icode = CODE_FOR_avx2_gatherdiv8sf;
11982 goto gather_gen;
11983 case IX86_BUILTIN_GATHERALTSIV4DI:
11984 icode = CODE_FOR_avx2_gathersiv4di;
11985 goto gather_gen;
11986 case IX86_BUILTIN_GATHERALTDIV8SI:
11987 icode = CODE_FOR_avx2_gatherdiv8si;
11988 goto gather_gen;
11989 case IX86_BUILTIN_GATHER3SIV16SF:
11990 icode = CODE_FOR_avx512f_gathersiv16sf;
11991 goto gather_gen;
11992 case IX86_BUILTIN_GATHER3SIV8DF:
11993 icode = CODE_FOR_avx512f_gathersiv8df;
11994 goto gather_gen;
11995 case IX86_BUILTIN_GATHER3DIV16SF:
11996 icode = CODE_FOR_avx512f_gatherdiv16sf;
11997 goto gather_gen;
11998 case IX86_BUILTIN_GATHER3DIV8DF:
11999 icode = CODE_FOR_avx512f_gatherdiv8df;
12000 goto gather_gen;
12001 case IX86_BUILTIN_GATHER3SIV16SI:
12002 icode = CODE_FOR_avx512f_gathersiv16si;
12003 goto gather_gen;
12004 case IX86_BUILTIN_GATHER3SIV8DI:
12005 icode = CODE_FOR_avx512f_gathersiv8di;
12006 goto gather_gen;
12007 case IX86_BUILTIN_GATHER3DIV16SI:
12008 icode = CODE_FOR_avx512f_gatherdiv16si;
12009 goto gather_gen;
12010 case IX86_BUILTIN_GATHER3DIV8DI:
12011 icode = CODE_FOR_avx512f_gatherdiv8di;
12012 goto gather_gen;
12013 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12014 icode = CODE_FOR_avx512f_gathersiv8df;
12015 goto gather_gen;
12016 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12017 icode = CODE_FOR_avx512f_gatherdiv16sf;
12018 goto gather_gen;
12019 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12020 icode = CODE_FOR_avx512f_gathersiv8di;
12021 goto gather_gen;
12022 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12023 icode = CODE_FOR_avx512f_gatherdiv16si;
12024 goto gather_gen;
12025 case IX86_BUILTIN_GATHER3SIV2DF:
12026 icode = CODE_FOR_avx512vl_gathersiv2df;
12027 goto gather_gen;
12028 case IX86_BUILTIN_GATHER3SIV4DF:
12029 icode = CODE_FOR_avx512vl_gathersiv4df;
12030 goto gather_gen;
12031 case IX86_BUILTIN_GATHER3DIV2DF:
12032 icode = CODE_FOR_avx512vl_gatherdiv2df;
12033 goto gather_gen;
12034 case IX86_BUILTIN_GATHER3DIV4DF:
12035 icode = CODE_FOR_avx512vl_gatherdiv4df;
12036 goto gather_gen;
12037 case IX86_BUILTIN_GATHER3SIV4SF:
12038 icode = CODE_FOR_avx512vl_gathersiv4sf;
12039 goto gather_gen;
12040 case IX86_BUILTIN_GATHER3SIV8SF:
12041 icode = CODE_FOR_avx512vl_gathersiv8sf;
12042 goto gather_gen;
12043 case IX86_BUILTIN_GATHER3DIV4SF:
12044 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12045 goto gather_gen;
12046 case IX86_BUILTIN_GATHER3DIV8SF:
12047 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12048 goto gather_gen;
12049 case IX86_BUILTIN_GATHER3SIV2DI:
12050 icode = CODE_FOR_avx512vl_gathersiv2di;
12051 goto gather_gen;
12052 case IX86_BUILTIN_GATHER3SIV4DI:
12053 icode = CODE_FOR_avx512vl_gathersiv4di;
12054 goto gather_gen;
12055 case IX86_BUILTIN_GATHER3DIV2DI:
12056 icode = CODE_FOR_avx512vl_gatherdiv2di;
12057 goto gather_gen;
12058 case IX86_BUILTIN_GATHER3DIV4DI:
12059 icode = CODE_FOR_avx512vl_gatherdiv4di;
12060 goto gather_gen;
12061 case IX86_BUILTIN_GATHER3SIV4SI:
12062 icode = CODE_FOR_avx512vl_gathersiv4si;
12063 goto gather_gen;
12064 case IX86_BUILTIN_GATHER3SIV8SI:
12065 icode = CODE_FOR_avx512vl_gathersiv8si;
12066 goto gather_gen;
12067 case IX86_BUILTIN_GATHER3DIV4SI:
12068 icode = CODE_FOR_avx512vl_gatherdiv4si;
12069 goto gather_gen;
12070 case IX86_BUILTIN_GATHER3DIV8SI:
12071 icode = CODE_FOR_avx512vl_gatherdiv8si;
12072 goto gather_gen;
12073 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12074 icode = CODE_FOR_avx512vl_gathersiv4df;
12075 goto gather_gen;
12076 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12077 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12078 goto gather_gen;
12079 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12080 icode = CODE_FOR_avx512vl_gathersiv4di;
12081 goto gather_gen;
12082 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12083 icode = CODE_FOR_avx512vl_gatherdiv8si;
12084 goto gather_gen;
12085 case IX86_BUILTIN_SCATTERSIV16SF:
12086 icode = CODE_FOR_avx512f_scattersiv16sf;
12087 goto scatter_gen;
12088 case IX86_BUILTIN_SCATTERSIV8DF:
12089 icode = CODE_FOR_avx512f_scattersiv8df;
12090 goto scatter_gen;
12091 case IX86_BUILTIN_SCATTERDIV16SF:
12092 icode = CODE_FOR_avx512f_scatterdiv16sf;
12093 goto scatter_gen;
12094 case IX86_BUILTIN_SCATTERDIV8DF:
12095 icode = CODE_FOR_avx512f_scatterdiv8df;
12096 goto scatter_gen;
12097 case IX86_BUILTIN_SCATTERSIV16SI:
12098 icode = CODE_FOR_avx512f_scattersiv16si;
12099 goto scatter_gen;
12100 case IX86_BUILTIN_SCATTERSIV8DI:
12101 icode = CODE_FOR_avx512f_scattersiv8di;
12102 goto scatter_gen;
12103 case IX86_BUILTIN_SCATTERDIV16SI:
12104 icode = CODE_FOR_avx512f_scatterdiv16si;
12105 goto scatter_gen;
12106 case IX86_BUILTIN_SCATTERDIV8DI:
12107 icode = CODE_FOR_avx512f_scatterdiv8di;
12108 goto scatter_gen;
12109 case IX86_BUILTIN_SCATTERSIV8SF:
12110 icode = CODE_FOR_avx512vl_scattersiv8sf;
12111 goto scatter_gen;
12112 case IX86_BUILTIN_SCATTERSIV4SF:
12113 icode = CODE_FOR_avx512vl_scattersiv4sf;
12114 goto scatter_gen;
12115 case IX86_BUILTIN_SCATTERSIV4DF:
12116 icode = CODE_FOR_avx512vl_scattersiv4df;
12117 goto scatter_gen;
12118 case IX86_BUILTIN_SCATTERSIV2DF:
12119 icode = CODE_FOR_avx512vl_scattersiv2df;
12120 goto scatter_gen;
12121 case IX86_BUILTIN_SCATTERDIV8SF:
12122 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12123 goto scatter_gen;
12124 case IX86_BUILTIN_SCATTERDIV4SF:
12125 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12126 goto scatter_gen;
12127 case IX86_BUILTIN_SCATTERDIV4DF:
12128 icode = CODE_FOR_avx512vl_scatterdiv4df;
12129 goto scatter_gen;
12130 case IX86_BUILTIN_SCATTERDIV2DF:
12131 icode = CODE_FOR_avx512vl_scatterdiv2df;
12132 goto scatter_gen;
12133 case IX86_BUILTIN_SCATTERSIV8SI:
12134 icode = CODE_FOR_avx512vl_scattersiv8si;
12135 goto scatter_gen;
12136 case IX86_BUILTIN_SCATTERSIV4SI:
12137 icode = CODE_FOR_avx512vl_scattersiv4si;
12138 goto scatter_gen;
12139 case IX86_BUILTIN_SCATTERSIV4DI:
12140 icode = CODE_FOR_avx512vl_scattersiv4di;
12141 goto scatter_gen;
12142 case IX86_BUILTIN_SCATTERSIV2DI:
12143 icode = CODE_FOR_avx512vl_scattersiv2di;
12144 goto scatter_gen;
12145 case IX86_BUILTIN_SCATTERDIV8SI:
12146 icode = CODE_FOR_avx512vl_scatterdiv8si;
12147 goto scatter_gen;
12148 case IX86_BUILTIN_SCATTERDIV4SI:
12149 icode = CODE_FOR_avx512vl_scatterdiv4si;
12150 goto scatter_gen;
12151 case IX86_BUILTIN_SCATTERDIV4DI:
12152 icode = CODE_FOR_avx512vl_scatterdiv4di;
12153 goto scatter_gen;
12154 case IX86_BUILTIN_SCATTERDIV2DI:
12155 icode = CODE_FOR_avx512vl_scatterdiv2di;
12156 goto scatter_gen;
12157 case IX86_BUILTIN_GATHERPFDPD:
12158 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12159 goto vec_prefetch_gen;
12160 case IX86_BUILTIN_SCATTERALTSIV8DF:
12161 icode = CODE_FOR_avx512f_scattersiv8df;
12162 goto scatter_gen;
12163 case IX86_BUILTIN_SCATTERALTDIV16SF:
12164 icode = CODE_FOR_avx512f_scatterdiv16sf;
12165 goto scatter_gen;
12166 case IX86_BUILTIN_SCATTERALTSIV8DI:
12167 icode = CODE_FOR_avx512f_scattersiv8di;
12168 goto scatter_gen;
12169 case IX86_BUILTIN_SCATTERALTDIV16SI:
12170 icode = CODE_FOR_avx512f_scatterdiv16si;
12171 goto scatter_gen;
12172 case IX86_BUILTIN_SCATTERALTSIV4DF:
12173 icode = CODE_FOR_avx512vl_scattersiv4df;
12174 goto scatter_gen;
12175 case IX86_BUILTIN_SCATTERALTDIV8SF:
12176 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12177 goto scatter_gen;
12178 case IX86_BUILTIN_SCATTERALTSIV4DI:
12179 icode = CODE_FOR_avx512vl_scattersiv4di;
12180 goto scatter_gen;
12181 case IX86_BUILTIN_SCATTERALTDIV8SI:
12182 icode = CODE_FOR_avx512vl_scatterdiv8si;
12183 goto scatter_gen;
12184 case IX86_BUILTIN_SCATTERALTSIV2DF:
12185 icode = CODE_FOR_avx512vl_scattersiv2df;
12186 goto scatter_gen;
12187 case IX86_BUILTIN_SCATTERALTDIV4SF:
12188 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12189 goto scatter_gen;
12190 case IX86_BUILTIN_SCATTERALTSIV2DI:
12191 icode = CODE_FOR_avx512vl_scattersiv2di;
12192 goto scatter_gen;
12193 case IX86_BUILTIN_SCATTERALTDIV4SI:
12194 icode = CODE_FOR_avx512vl_scatterdiv4si;
12195 goto scatter_gen;
12196 case IX86_BUILTIN_GATHERPFDPS:
12197 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12198 goto vec_prefetch_gen;
12199 case IX86_BUILTIN_GATHERPFQPD:
12200 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12201 goto vec_prefetch_gen;
12202 case IX86_BUILTIN_GATHERPFQPS:
12203 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12204 goto vec_prefetch_gen;
12205 case IX86_BUILTIN_SCATTERPFDPD:
12206 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12207 goto vec_prefetch_gen;
12208 case IX86_BUILTIN_SCATTERPFDPS:
12209 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12210 goto vec_prefetch_gen;
12211 case IX86_BUILTIN_SCATTERPFQPD:
12212 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12213 goto vec_prefetch_gen;
12214 case IX86_BUILTIN_SCATTERPFQPS:
12215 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12216 goto vec_prefetch_gen;
12217
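/* Descriptive comment (added): common gather expansion.  The builtin
   arguments are the pass-through vector, the base address, the index vector,
   the mask and the scale.  The *ALT* variants pair index and data vectors
   with different element counts, so only the low half of the wider operand is
   used (see the switch below).  */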
12218 gather_gen:
12219 rtx half;
12220 rtx (*gen) (rtx, rtx);
12221
12222 arg0 = CALL_EXPR_ARG (exp, 0);
12223 arg1 = CALL_EXPR_ARG (exp, 1);
12224 arg2 = CALL_EXPR_ARG (exp, 2);
12225 arg3 = CALL_EXPR_ARG (exp, 3);
12226 arg4 = CALL_EXPR_ARG (exp, 4);
12227 op0 = expand_normal (arg0);
12228 op1 = expand_normal (arg1);
12229 op2 = expand_normal (arg2);
12230 op3 = expand_normal (arg3);
12231 op4 = expand_normal (arg4);
12232 /* Note the arg order is different from the operand order. */
12233 mode0 = insn_data[icode].operand[1].mode;
12234 mode2 = insn_data[icode].operand[3].mode;
12235 mode3 = insn_data[icode].operand[4].mode;
12236 mode4 = insn_data[icode].operand[5].mode;
12237
12238 if (target == NULL_RTX
12239 || GET_MODE (target) != insn_data[icode].operand[0].mode
12240 || !insn_data[icode].operand[0].predicate (target,
12241 GET_MODE (target)))
12242 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12243 else
12244 subtarget = target;
12245
12246 switch (fcode)
12247 {
12248 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12249 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12250 half = gen_reg_rtx (V8SImode);
12251 if (!nonimmediate_operand (op2, V16SImode))
12252 op2 = copy_to_mode_reg (V16SImode, op2);
12253 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12254 op2 = half;
12255 break;
12256 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12257 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12258 case IX86_BUILTIN_GATHERALTSIV4DF:
12259 case IX86_BUILTIN_GATHERALTSIV4DI:
12260 half = gen_reg_rtx (V4SImode);
12261 if (!nonimmediate_operand (op2, V8SImode))
12262 op2 = copy_to_mode_reg (V8SImode, op2);
12263 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12264 op2 = half;
12265 break;
12266 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12267 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12268 half = gen_reg_rtx (mode0);
12269 if (mode0 == V8SFmode)
12270 gen = gen_vec_extract_lo_v16sf;
12271 else
12272 gen = gen_vec_extract_lo_v16si;
12273 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12274 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12275 emit_insn (gen (half, op0));
12276 op0 = half;
12277 op3 = lowpart_subreg (QImode, op3, HImode);
12278 break;
12279 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12280 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12281 case IX86_BUILTIN_GATHERALTDIV8SF:
12282 case IX86_BUILTIN_GATHERALTDIV8SI:
12283 half = gen_reg_rtx (mode0);
12284 if (mode0 == V4SFmode)
12285 gen = gen_vec_extract_lo_v8sf;
12286 else
12287 gen = gen_vec_extract_lo_v8si;
12288 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12289 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12290 emit_insn (gen (half, op0));
12291 op0 = half;
12292 if (VECTOR_MODE_P (GET_MODE (op3)))
12293 {
12294 half = gen_reg_rtx (mode0);
12295 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12296 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12297 emit_insn (gen (half, op3));
12298 op3 = half;
12299 }
12300 break;
12301 default:
12302 break;
12303 }
12304
12305 /* Force the memory operand to be addressed through a base register
12306 only. We don't want to do that for the memory operands of other
12307 builtin functions. */
12308 op1 = ix86_zero_extend_to_Pmode (op1);
12309
12310 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12311 op0 = copy_to_mode_reg (mode0, op0);
12312 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12313 op1 = copy_to_mode_reg (Pmode, op1);
12314 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12315 op2 = copy_to_mode_reg (mode2, op2);
12316
12317 op3 = fixup_modeless_constant (op3, mode3);
12318
12319 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12320 {
12321 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12322 op3 = copy_to_mode_reg (mode3, op3);
12323 }
12324 else
12325 {
12326 op3 = copy_to_reg (op3);
12327 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12328 }
12329 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12330 {
12331 error ("the last argument must be scale 1, 2, 4, 8");
12332 return const0_rtx;
12333 }
12334
12335 /* Optimize. If mask is known to have all high bits set,
12336 replace op0 with pc_rtx to signal that the instruction
12337 overwrites the whole destination and doesn't use its
12338 previous contents. */
12339 if (optimize)
12340 {
12341 if (TREE_CODE (arg3) == INTEGER_CST)
12342 {
12343 if (integer_all_onesp (arg3))
12344 op0 = pc_rtx;
12345 }
12346 else if (TREE_CODE (arg3) == VECTOR_CST)
12347 {
12348 unsigned int negative = 0;
12349 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12350 {
12351 tree cst = VECTOR_CST_ELT (arg3, i);
12352 if (TREE_CODE (cst) == INTEGER_CST
12353 && tree_int_cst_sign_bit (cst))
12354 negative++;
12355 else if (TREE_CODE (cst) == REAL_CST
12356 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12357 negative++;
12358 }
12359 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12360 op0 = pc_rtx;
12361 }
12362 else if (TREE_CODE (arg3) == SSA_NAME
12363 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12364 {
12365 /* Recognize also when mask is like:
12366 __v2df src = _mm_setzero_pd ();
12367 __v2df mask = _mm_cmpeq_pd (src, src);
12368 or
12369 __v8sf src = _mm256_setzero_ps ();
12370 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12371 as that is a cheaper way to load all ones into
12372 a register than having to load a constant from
12373 memory. */
12374 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12375 if (is_gimple_call (def_stmt))
12376 {
12377 tree fndecl = gimple_call_fndecl (def_stmt);
12378 if (fndecl
12379 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12380 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
12381 {
12382 case IX86_BUILTIN_CMPPD:
12383 case IX86_BUILTIN_CMPPS:
12384 case IX86_BUILTIN_CMPPD256:
12385 case IX86_BUILTIN_CMPPS256:
12386 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12387 break;
12388 /* FALLTHRU */
12389 case IX86_BUILTIN_CMPEQPD:
12390 case IX86_BUILTIN_CMPEQPS:
12391 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12392 && initializer_zerop (gimple_call_arg (def_stmt,
12393 1)))
12394 op0 = pc_rtx;
12395 break;
12396 default:
12397 break;
12398 }
12399 }
12400 }
12401 }
12402
12403 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12404 if (! pat)
12405 return const0_rtx;
12406 emit_insn (pat);
12407
12408 switch (fcode)
12409 {
12410 case IX86_BUILTIN_GATHER3DIV16SF:
12411 if (target == NULL_RTX)
12412 target = gen_reg_rtx (V8SFmode);
12413 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12414 break;
12415 case IX86_BUILTIN_GATHER3DIV16SI:
12416 if (target == NULL_RTX)
12417 target = gen_reg_rtx (V8SImode);
12418 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12419 break;
12420 case IX86_BUILTIN_GATHER3DIV8SF:
12421 case IX86_BUILTIN_GATHERDIV8SF:
12422 if (target == NULL_RTX)
12423 target = gen_reg_rtx (V4SFmode);
12424 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12425 break;
12426 case IX86_BUILTIN_GATHER3DIV8SI:
12427 case IX86_BUILTIN_GATHERDIV8SI:
12428 if (target == NULL_RTX)
12429 target = gen_reg_rtx (V4SImode);
12430 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12431 break;
12432 default:
12433 target = subtarget;
12434 break;
12435 }
12436 return target;
12437
12438 scatter_gen:
12439 arg0 = CALL_EXPR_ARG (exp, 0);
12440 arg1 = CALL_EXPR_ARG (exp, 1);
12441 arg2 = CALL_EXPR_ARG (exp, 2);
12442 arg3 = CALL_EXPR_ARG (exp, 3);
12443 arg4 = CALL_EXPR_ARG (exp, 4);
12444 op0 = expand_normal (arg0);
12445 op1 = expand_normal (arg1);
12446 op2 = expand_normal (arg2);
12447 op3 = expand_normal (arg3);
12448 op4 = expand_normal (arg4);
12449 mode1 = insn_data[icode].operand[1].mode;
12450 mode2 = insn_data[icode].operand[2].mode;
12451 mode3 = insn_data[icode].operand[3].mode;
12452 mode4 = insn_data[icode].operand[4].mode;
12453
12454 /* Scatter instruction stores operand op3 to memory with
12455 indices from op2 and scale from op4 under writemask op1.
12456 If index operand op2 has more elements than source operand
12457 op3, only its low half needs to be used, and vice versa. */
12458 switch (fcode)
12459 {
12460 case IX86_BUILTIN_SCATTERALTSIV8DF:
12461 case IX86_BUILTIN_SCATTERALTSIV8DI:
12462 half = gen_reg_rtx (V8SImode);
12463 if (!nonimmediate_operand (op2, V16SImode))
12464 op2 = copy_to_mode_reg (V16SImode, op2);
12465 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12466 op2 = half;
12467 break;
12468 case IX86_BUILTIN_SCATTERALTDIV16SF:
12469 case IX86_BUILTIN_SCATTERALTDIV16SI:
12470 half = gen_reg_rtx (mode3);
12471 if (mode3 == V8SFmode)
12472 gen = gen_vec_extract_lo_v16sf;
12473 else
12474 gen = gen_vec_extract_lo_v16si;
12475 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12476 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12477 emit_insn (gen (half, op3));
12478 op3 = half;
12479 break;
12480 case IX86_BUILTIN_SCATTERALTSIV4DF:
12481 case IX86_BUILTIN_SCATTERALTSIV4DI:
12482 half = gen_reg_rtx (V4SImode);
12483 if (!nonimmediate_operand (op2, V8SImode))
12484 op2 = copy_to_mode_reg (V8SImode, op2);
12485 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12486 op2 = half;
12487 break;
12488 case IX86_BUILTIN_SCATTERALTDIV8SF:
12489 case IX86_BUILTIN_SCATTERALTDIV8SI:
12490 half = gen_reg_rtx (mode3);
12491 if (mode3 == V4SFmode)
12492 gen = gen_vec_extract_lo_v8sf;
12493 else
12494 gen = gen_vec_extract_lo_v8si;
12495 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12496 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12497 emit_insn (gen (half, op3));
12498 op3 = half;
12499 break;
12500 case IX86_BUILTIN_SCATTERALTSIV2DF:
12501 case IX86_BUILTIN_SCATTERALTSIV2DI:
12502 if (!nonimmediate_operand (op2, V4SImode))
12503 op2 = copy_to_mode_reg (V4SImode, op2);
12504 break;
12505 case IX86_BUILTIN_SCATTERALTDIV4SF:
12506 case IX86_BUILTIN_SCATTERALTDIV4SI:
12507 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12508 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12509 break;
12510 default:
12511 break;
12512 }
12513
12514 /* Force the memory operand to be addressed through a base register
12515 only. We don't want to do that for the memory operands of other
12516 builtin functions. */
12517 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12518
12519 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12520 op0 = copy_to_mode_reg (Pmode, op0);
12521
12522 op1 = fixup_modeless_constant (op1, mode1);
12523
12524 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12525 {
12526 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12527 op1 = copy_to_mode_reg (mode1, op1);
12528 }
12529 else
12530 {
12531 op1 = copy_to_reg (op1);
12532 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12533 }
12534
12535 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12536 op2 = copy_to_mode_reg (mode2, op2);
12537
12538 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12539 op3 = copy_to_mode_reg (mode3, op3);
12540
12541 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12542 {
12543 error ("the last argument must be scale 1, 2, 4, 8");
12544 return const0_rtx;
12545 }
12546
12547 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12548 if (! pat)
12549 return const0_rtx;
12550
12551 emit_insn (pat);
12552 return 0;
12553
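/* Descriptive comment (added): gather/scatter prefetch expansion.  The
   builtin arguments are the mask, the index vector, the base address, the
   scale and the locality hint; the last two must be immediates accepted by
   the pattern's predicates, otherwise an error is raised below.  */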
12554 vec_prefetch_gen:
12555 arg0 = CALL_EXPR_ARG (exp, 0);
12556 arg1 = CALL_EXPR_ARG (exp, 1);
12557 arg2 = CALL_EXPR_ARG (exp, 2);
12558 arg3 = CALL_EXPR_ARG (exp, 3);
12559 arg4 = CALL_EXPR_ARG (exp, 4);
12560 op0 = expand_normal (arg0);
12561 op1 = expand_normal (arg1);
12562 op2 = expand_normal (arg2);
12563 op3 = expand_normal (arg3);
12564 op4 = expand_normal (arg4);
12565 mode0 = insn_data[icode].operand[0].mode;
12566 mode1 = insn_data[icode].operand[1].mode;
12567 mode3 = insn_data[icode].operand[3].mode;
12568 mode4 = insn_data[icode].operand[4].mode;
12569
12570 op0 = fixup_modeless_constant (op0, mode0);
12571
12572 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12573 {
12574 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12575 op0 = copy_to_mode_reg (mode0, op0);
12576 }
12577 else
12578 {
12579 op0 = copy_to_reg (op0);
12580 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12581 }
12582
12583 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12584 op1 = copy_to_mode_reg (mode1, op1);
12585
12586 /* Force the memory operand to be addressed through a base register
12587 only. We don't want to do that for the memory operands of other
12588 builtin functions. */
12589 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12590
12591 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12592 op2 = copy_to_mode_reg (Pmode, op2);
12593
12594 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12595 {
12596 error ("the forth argument must be scale 1, 2, 4, 8");
12597 return const0_rtx;
12598 }
12599
12600 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12601 {
12602 error ("incorrect hint operand");
12603 return const0_rtx;
12604 }
12605
12606 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12607 if (! pat)
12608 return const0_rtx;
12609
12610 emit_insn (pat);
12611
12612 return 0;
12613
12614 case IX86_BUILTIN_XABORT:
12615 icode = CODE_FOR_xabort;
12616 arg0 = CALL_EXPR_ARG (exp, 0);
12617 op0 = expand_normal (arg0);
12618 mode0 = insn_data[icode].operand[0].mode;
12619 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12620 {
12621 error ("the argument to %<xabort%> intrinsic must "
12622 "be an 8-bit immediate");
12623 return const0_rtx;
12624 }
12625 emit_insn (gen_xabort (op0));
12626 return 0;
12627
12628 case IX86_BUILTIN_RSTORSSP:
12629 case IX86_BUILTIN_CLRSSBSY:
12630 arg0 = CALL_EXPR_ARG (exp, 0);
12631 op0 = expand_normal (arg0);
12632 icode = (fcode == IX86_BUILTIN_RSTORSSP
12633 ? CODE_FOR_rstorssp
12634 : CODE_FOR_clrssbsy);
12635 if (!address_operand (op0, VOIDmode))
12636 {
12637 op1 = convert_memory_address (Pmode, op0);
12638 op0 = copy_addr_to_reg (op1);
12639 }
12640 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12641 return 0;
12642
12643 case IX86_BUILTIN_WRSSD:
12644 case IX86_BUILTIN_WRSSQ:
12645 case IX86_BUILTIN_WRUSSD:
12646 case IX86_BUILTIN_WRUSSQ:
12647 arg0 = CALL_EXPR_ARG (exp, 0);
12648 op0 = expand_normal (arg0);
12649 arg1 = CALL_EXPR_ARG (exp, 1);
12650 op1 = expand_normal (arg1);
12651 switch (fcode)
12652 {
12653 case IX86_BUILTIN_WRSSD:
12654 icode = CODE_FOR_wrsssi;
12655 mode = SImode;
12656 break;
12657 case IX86_BUILTIN_WRSSQ:
12658 icode = CODE_FOR_wrssdi;
12659 mode = DImode;
12660 break;
12661 case IX86_BUILTIN_WRUSSD:
12662 icode = CODE_FOR_wrusssi;
12663 mode = SImode;
12664 break;
12665 case IX86_BUILTIN_WRUSSQ:
12666 icode = CODE_FOR_wrussdi;
12667 mode = DImode;
12668 break;
12669 }
12670 op0 = force_reg (mode, op0);
12671 if (!address_operand (op1, VOIDmode))
12672 {
12673 op2 = convert_memory_address (Pmode, op1);
12674 op1 = copy_addr_to_reg (op2);
12675 }
12676 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12677 return 0;
12678
12679 default:
12680 break;
12681 }
12682
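/* Descriptive comment (added): anything not handled by the switch above is
   table-driven: map the function code onto the matching bdesc_* descriptor
   table and let the generic argument expanders do the work.  */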
12683 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12684 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12685 {
12686 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12687 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12688 target);
12689 }
12690
12691 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12692 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12693 {
12694 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12695 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12696 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12697 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12698 int masked = 1;
12699 machine_mode mode, wide_mode, nar_mode;
12700
12701 nar_mode = V4SFmode;
12702 mode = V16SFmode;
12703 wide_mode = V64SFmode;
12704 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12705 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12706
12707 switch (fcode)
12708 {
12709 case IX86_BUILTIN_4FMAPS:
12710 fcn = gen_avx5124fmaddps_4fmaddps;
12711 masked = 0;
12712 goto v4fma_expand;
12713
12714 case IX86_BUILTIN_4DPWSSD:
12715 nar_mode = V4SImode;
12716 mode = V16SImode;
12717 wide_mode = V64SImode;
12718 fcn = gen_avx5124vnniw_vp4dpwssd;
12719 masked = 0;
12720 goto v4fma_expand;
12721
12722 case IX86_BUILTIN_4DPWSSDS:
12723 nar_mode = V4SImode;
12724 mode = V16SImode;
12725 wide_mode = V64SImode;
12726 fcn = gen_avx5124vnniw_vp4dpwssds;
12727 masked = 0;
12728 goto v4fma_expand;
12729
12730 case IX86_BUILTIN_4FNMAPS:
12731 fcn = gen_avx5124fmaddps_4fnmaddps;
12732 masked = 0;
12733 goto v4fma_expand;
12734
12735 case IX86_BUILTIN_4FNMAPS_MASK:
12736 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12737 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12738 goto v4fma_expand;
12739
12740 case IX86_BUILTIN_4DPWSSD_MASK:
12741 nar_mode = V4SImode;
12742 mode = V16SImode;
12743 wide_mode = V64SImode;
12744 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12745 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12746 goto v4fma_expand;
12747
12748 case IX86_BUILTIN_4DPWSSDS_MASK:
12749 nar_mode = V4SImode;
12750 mode = V16SImode;
12751 wide_mode = V64SImode;
12752 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12753 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12754 goto v4fma_expand;
12755
12756 case IX86_BUILTIN_4FMAPS_MASK:
12757 {
12758 tree args[4];
12759 rtx ops[4];
12760 rtx wide_reg;
12761 rtx accum;
12762 rtx addr;
12763 rtx mem;
12764
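/* Descriptive comment (added): the 4FMAPS/4VNNIW builtins operate on a group
   of four consecutive 512-bit registers plus a 128-bit memory operand.  The
   group is modelled as one V64SF/V64SI pseudo, with each 512-bit argument
   moved into its 64-byte slice via a SUBREG before the accumulating
   instruction is emitted.  */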
12765 v4fma_expand:
12766 wide_reg = gen_reg_rtx (wide_mode);
12767 for (i = 0; i < 4; i++)
12768 {
12769 args[i] = CALL_EXPR_ARG (exp, i);
12770 ops[i] = expand_normal (args[i]);
12771
12772 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12773 ops[i]);
12774 }
12775
12776 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12777 accum = force_reg (mode, accum);
12778
12779 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12780 addr = force_reg (Pmode, addr);
12781
12782 mem = gen_rtx_MEM (nar_mode, addr);
12783
12784 target = gen_reg_rtx (mode);
12785
12786 emit_move_insn (target, accum);
12787
12788 if (! masked)
12789 emit_insn (fcn (target, accum, wide_reg, mem));
12790 else
12791 {
12792 rtx merge, mask;
12793 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12794
12795 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12796
12797 if (CONST_INT_P (mask))
12798 mask = fixup_modeless_constant (mask, HImode);
12799
12800 mask = force_reg (HImode, mask);
12801
12802 if (GET_MODE (mask) != HImode)
12803 mask = gen_rtx_SUBREG (HImode, mask, 0);
12804
12805 /* If merge is 0 then we're about to emit z-masked variant. */
12806 if (const0_operand (merge, mode))
12807 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12808 /* If merge is the same as accum then emit merge-masked variant. */
12809 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12810 {
12811 merge = force_reg (mode, merge);
12812 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12813 }
12814 /* Merging with something unknown can happen if we z-mask at -O0. */
12815 else
12816 {
12817 target = gen_reg_rtx (mode);
12818 emit_move_insn (target, merge);
12819 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12820 }
12821 }
12822 return target;
12823 }
12824
12825 case IX86_BUILTIN_4FNMASS:
12826 fcn = gen_avx5124fmaddps_4fnmaddss;
12827 masked = 0;
12828 goto s4fma_expand;
12829
12830 case IX86_BUILTIN_4FMASS:
12831 fcn = gen_avx5124fmaddps_4fmaddss;
12832 masked = 0;
12833 goto s4fma_expand;
12834
12835 case IX86_BUILTIN_4FNMASS_MASK:
12836 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
12837 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
12838 goto s4fma_expand;
12839
12840 case IX86_BUILTIN_4FMASS_MASK:
12841 {
12842 tree args[4];
12843 rtx ops[4];
12844 rtx wide_reg;
12845 rtx accum;
12846 rtx addr;
12847 rtx mem;
12848
12849 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
12850 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
12851
12852 s4fma_expand:
12853 mode = V4SFmode;
12854 wide_reg = gen_reg_rtx (V64SFmode);
12855 for (i = 0; i < 4; i++)
12856 {
12857 rtx tmp;
12858 args[i] = CALL_EXPR_ARG (exp, i);
12859 ops[i] = expand_normal (args[i]);
12860
12861 tmp = gen_reg_rtx (SFmode);
12862 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
12863
12864 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
12865 gen_rtx_SUBREG (V16SFmode, tmp, 0));
12866 }
12867
12868 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12869 accum = force_reg (V4SFmode, accum);
12870
12871 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12872 addr = force_reg (Pmode, addr);
12873
12874 mem = gen_rtx_MEM (V4SFmode, addr);
12875
12876 target = gen_reg_rtx (V4SFmode);
12877
12878 emit_move_insn (target, accum);
12879
12880 if (! masked)
12881 emit_insn (fcn (target, accum, wide_reg, mem));
12882 else
12883 {
12884 rtx merge, mask;
12885 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12886
12887 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12888
12889 if (CONST_INT_P (mask))
12890 mask = fixup_modeless_constant (mask, QImode);
12891
12892 mask = force_reg (QImode, mask);
12893
12894 if (GET_MODE (mask) != QImode)
12895 mask = gen_rtx_SUBREG (QImode, mask, 0);
12896
12897 /* If merge is 0 then we're about to emit z-masked variant. */
12898 if (const0_operand (merge, mode))
12899 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12900 /* If merge is the same as accum then emit merge-masked
12901 variant. */
12902 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12903 {
12904 merge = force_reg (mode, merge);
12905 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12906 }
12907 /* Merging with something unknown can happen if we z-mask
12908 at -O0. */
12909 else
12910 {
12911 target = gen_reg_rtx (mode);
12912 emit_move_insn (target, merge);
12913 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12914 }
12915 }
12916 return target;
12917 }
12918 case IX86_BUILTIN_RDPID:
12919 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
12920 target);
12921 case IX86_BUILTIN_FABSQ:
12922 case IX86_BUILTIN_COPYSIGNQ:
12923 if (!TARGET_SSE)
12924 /* Emit a normal call if SSE isn't available. */
12925 return expand_call (exp, target, ignore);
12926 /* FALLTHRU */
12927 default:
12928 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
12929 }
12930 }
12931
12932 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
12933 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
12934 {
12935 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
12936 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
12937 }
12938
12939 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
12940 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
12941 {
12942 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
12943 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
12944 }
12945
12946 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
12947 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
12948 {
12949 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
12950 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
12951 }
12952
12953 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
12954 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
12955 {
12956 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
12957 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
12958 }
12959
12960 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
12961 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
12962 {
12963 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
12964 const struct builtin_description *d = bdesc_multi_arg + i;
12965 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
12966 (enum ix86_builtin_func_type)
12967 d->flag, d->comparison);
12968 }
12969
12970 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
12971 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
12972 {
12973 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
12974 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
12975 target);
12976 }
12977
12978 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
12979 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
12980 {
12981 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
12982 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
12983 target);
12984 }
12985
12986 gcc_unreachable ();
12987 }
12988
12989 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
12990 fill target with val via vec_duplicate. */
12991
12992 static bool
12993 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
12994 {
12995 bool ok;
12996 rtx_insn *insn;
12997 rtx dup;
12998
12999 /* First attempt to recognize VAL as-is. */
13000 dup = gen_vec_duplicate (mode, val);
13001 insn = emit_insn (gen_rtx_SET (target, dup));
13002 if (recog_memoized (insn) < 0)
13003 {
13004 rtx_insn *seq;
13005 machine_mode innermode = GET_MODE_INNER (mode);
13006 rtx reg;
13007
13008 /* If that fails, force VAL into a register. */
13009
13010 start_sequence ();
13011 reg = force_reg (innermode, val);
13012 if (GET_MODE (reg) != innermode)
13013 reg = gen_lowpart (innermode, reg);
13014 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13015 seq = get_insns ();
13016 end_sequence ();
13017 if (seq)
13018 emit_insn_before (seq, insn);
13019
13020 ok = recog_memoized (insn) >= 0;
13021 gcc_assert (ok);
13022 }
13023 return true;
13024 }
13025
13026 /* Get a vector mode of the same size as the original but with elements
13027 twice as wide. This is only guaranteed to apply to integral vectors. */
13028
13029 static machine_mode
13030 get_mode_wider_vector (machine_mode o)
13031 {
13032 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13033 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13034 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13035 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13036 return n;
13037 }
13038
13039 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13040 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13041
13042 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13043 with all elements equal to VAR. Return true if successful. */
13044
13045 static bool
13046 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13047 rtx target, rtx val)
13048 {
13049 bool ok;
13050
13051 switch (mode)
13052 {
13053 case E_V2SImode:
13054 case E_V2SFmode:
13055 if (!mmx_ok)
13056 return false;
13057 /* FALLTHRU */
13058
13059 case E_V4DFmode:
13060 case E_V4DImode:
13061 case E_V8SFmode:
13062 case E_V8SImode:
13063 case E_V2DFmode:
13064 case E_V2DImode:
13065 case E_V4SFmode:
13066 case E_V4SImode:
13067 case E_V16SImode:
13068 case E_V8DImode:
13069 case E_V16SFmode:
13070 case E_V8DFmode:
13071 return ix86_vector_duplicate_value (mode, target, val);
13072
13073 case E_V4HImode:
13074 if (!mmx_ok)
13075 return false;
13076 if (TARGET_SSE || TARGET_3DNOW_A)
13077 {
13078 rtx x;
13079
13080 val = gen_lowpart (SImode, val);
13081 x = gen_rtx_TRUNCATE (HImode, val);
13082 x = gen_rtx_VEC_DUPLICATE (mode, x);
13083 emit_insn (gen_rtx_SET (target, x));
13084 return true;
13085 }
13086 goto widen;
13087
13088 case E_V8QImode:
13089 if (!mmx_ok)
13090 return false;
13091 goto widen;
13092
13093 case E_V8HImode:
13094 if (TARGET_AVX2)
13095 return ix86_vector_duplicate_value (mode, target, val);
13096
13097 if (TARGET_SSE2)
13098 {
13099 struct expand_vec_perm_d dperm;
13100 rtx tmp1, tmp2;
13101
13102 permute:
13103 memset (&dperm, 0, sizeof (dperm));
13104 dperm.target = target;
13105 dperm.vmode = mode;
13106 dperm.nelt = GET_MODE_NUNITS (mode);
13107 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13108 dperm.one_operand_p = true;
13109
13110 /* Extend to SImode using a paradoxical SUBREG. */
13111 tmp1 = gen_reg_rtx (SImode);
13112 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13113
13114 /* Insert the SImode value as low element of a V4SImode vector. */
13115 tmp2 = gen_reg_rtx (V4SImode);
13116 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13117 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13118
13119 ok = (expand_vec_perm_1 (&dperm)
13120 || expand_vec_perm_broadcast_1 (&dperm));
13121 gcc_assert (ok);
13122 return ok;
13123 }
13124 goto widen;
13125
13126 case E_V16QImode:
13127 if (TARGET_AVX2)
13128 return ix86_vector_duplicate_value (mode, target, val);
13129
13130 if (TARGET_SSE2)
13131 goto permute;
13132 goto widen;
13133
13134 widen:
13135 /* Replicate the value once into the next wider mode and recurse. */
13136 {
13137 machine_mode smode, wsmode, wvmode;
13138 rtx x;
13139
13140 smode = GET_MODE_INNER (mode);
13141 wvmode = get_mode_wider_vector (mode);
13142 wsmode = GET_MODE_INNER (wvmode);
13143
13144 val = convert_modes (wsmode, smode, val, true);
13145 x = expand_simple_binop (wsmode, ASHIFT, val,
13146 GEN_INT (GET_MODE_BITSIZE (smode)),
13147 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13148 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13149
13150 x = gen_reg_rtx (wvmode);
13151 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13152 gcc_assert (ok);
13153 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13154 return ok;
13155 }
13156
13157 case E_V16HImode:
13158 case E_V32QImode:
13159 if (TARGET_AVX2)
13160 return ix86_vector_duplicate_value (mode, target, val);
13161 else
13162 {
13163 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13164 rtx x = gen_reg_rtx (hvmode);
13165
13166 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13167 gcc_assert (ok);
13168
13169 x = gen_rtx_VEC_CONCAT (mode, x, x);
13170 emit_insn (gen_rtx_SET (target, x));
13171 }
13172 return true;
13173
13174 case E_V64QImode:
13175 case E_V32HImode:
13176 if (TARGET_AVX512BW)
13177 return ix86_vector_duplicate_value (mode, target, val);
13178 else
13179 {
13180 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13181 rtx x = gen_reg_rtx (hvmode);
13182
13183 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13184 gcc_assert (ok);
13185
13186 x = gen_rtx_VEC_CONCAT (mode, x, x);
13187 emit_insn (gen_rtx_SET (target, x));
13188 }
13189 return true;
13190
13191 default:
13192 return false;
13193 }
13194 }
13195
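/* Illustrative sketch (guarded out) of the scalar side of the "widen" case
   above: a QImode value is replicated into both bytes of an HImode word
   with a shift and an IOR, after which the recursion only has to broadcast
   the wider scalar.  The function name is a placeholder.  */
#if 0
static rtx
example_replicate_qi_into_hi (rtx val)
{
  /* Zero-extend VAL to HImode, then OR it with itself shifted left by 8,
     giving a 16-bit word whose two bytes both equal the original value.  */
  rtx w = convert_modes (HImode, QImode, val, true);
  rtx hi = expand_simple_binop (HImode, ASHIFT, w, GEN_INT (8),
				NULL_RTX, 1, OPTAB_LIB_WIDEN);
  return expand_simple_binop (HImode, IOR, w, hi, NULL_RTX, 1,
			      OPTAB_LIB_WIDEN);
}
#endif
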
13196 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13197 whose ONE_VAR element is VAR, and other elements are zero. Return true
13198 if successful. */
13199
13200 static bool
13201 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13202 rtx target, rtx var, int one_var)
13203 {
13204 machine_mode vsimode;
13205 rtx new_target;
13206 rtx x, tmp;
13207 bool use_vector_set = false;
13208 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13209
13210 switch (mode)
13211 {
13212 case E_V2DImode:
13213 /* For SSE4.1, we normally use vector set. But if the second
13214 element is zero and inter-unit moves are OK, we use movq
13215 instead. */
13216 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13217 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13218 && one_var == 0));
13219 break;
13220 case E_V16QImode:
13221 case E_V4SImode:
13222 case E_V4SFmode:
13223 use_vector_set = TARGET_SSE4_1;
13224 break;
13225 case E_V8HImode:
13226 use_vector_set = TARGET_SSE2;
13227 break;
13228 case E_V4HImode:
13229 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13230 break;
13231 case E_V32QImode:
13232 case E_V16HImode:
13233 use_vector_set = TARGET_AVX;
13234 break;
13235 case E_V8SImode:
13236 use_vector_set = TARGET_AVX;
13237 gen_vec_set_0 = gen_vec_setv8si_0;
13238 break;
13239 case E_V8SFmode:
13240 use_vector_set = TARGET_AVX;
13241 gen_vec_set_0 = gen_vec_setv8sf_0;
13242 break;
13243 case E_V4DFmode:
13244 use_vector_set = TARGET_AVX;
13245 gen_vec_set_0 = gen_vec_setv4df_0;
13246 break;
13247 case E_V4DImode:
13248 /* Use ix86_expand_vector_set in 64bit mode only. */
13249 use_vector_set = TARGET_AVX && TARGET_64BIT;
13250 gen_vec_set_0 = gen_vec_setv4di_0;
13251 break;
13252 case E_V16SImode:
13253 use_vector_set = TARGET_AVX512F && one_var == 0;
13254 gen_vec_set_0 = gen_vec_setv16si_0;
13255 break;
13256 case E_V16SFmode:
13257 use_vector_set = TARGET_AVX512F && one_var == 0;
13258 gen_vec_set_0 = gen_vec_setv16sf_0;
13259 break;
13260 case E_V8DFmode:
13261 use_vector_set = TARGET_AVX512F && one_var == 0;
13262 gen_vec_set_0 = gen_vec_setv8df_0;
13263 break;
13264 case E_V8DImode:
13265 /* Use ix86_expand_vector_set in 64bit mode only. */
13266 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13267 gen_vec_set_0 = gen_vec_setv8di_0;
13268 break;
13269 default:
13270 break;
13271 }
13272
13273 if (use_vector_set)
13274 {
13275 if (gen_vec_set_0 && one_var == 0)
13276 {
13277 var = force_reg (GET_MODE_INNER (mode), var);
13278 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13279 return true;
13280 }
13281 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13282 var = force_reg (GET_MODE_INNER (mode), var);
13283 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13284 return true;
13285 }
13286
13287 switch (mode)
13288 {
13289 case E_V2SFmode:
13290 case E_V2SImode:
13291 if (!mmx_ok)
13292 return false;
13293 /* FALLTHRU */
13294
13295 case E_V2DFmode:
13296 case E_V2DImode:
13297 if (one_var != 0)
13298 return false;
13299 var = force_reg (GET_MODE_INNER (mode), var);
13300 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13301 emit_insn (gen_rtx_SET (target, x));
13302 return true;
13303
13304 case E_V4SFmode:
13305 case E_V4SImode:
13306 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13307 new_target = gen_reg_rtx (mode);
13308 else
13309 new_target = target;
13310 var = force_reg (GET_MODE_INNER (mode), var);
13311 x = gen_rtx_VEC_DUPLICATE (mode, var);
13312 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13313 emit_insn (gen_rtx_SET (new_target, x));
13314 if (one_var != 0)
13315 {
13316 /* We need to shuffle the value to the correct position, so
13317 create a new pseudo to store the intermediate result. */
13318
13319 /* With SSE2, we can use the integer shuffle insns. */
13320 if (mode != V4SFmode && TARGET_SSE2)
13321 {
13322 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13323 const1_rtx,
13324 GEN_INT (one_var == 1 ? 0 : 1),
13325 GEN_INT (one_var == 2 ? 0 : 1),
13326 GEN_INT (one_var == 3 ? 0 : 1)));
13327 if (target != new_target)
13328 emit_move_insn (target, new_target);
13329 return true;
13330 }
13331
13332 /* Otherwise convert the intermediate result to V4SFmode and
13333 use the SSE1 shuffle instructions. */
13334 if (mode != V4SFmode)
13335 {
13336 tmp = gen_reg_rtx (V4SFmode);
13337 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13338 }
13339 else
13340 tmp = new_target;
13341
13342 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13343 const1_rtx,
13344 GEN_INT (one_var == 1 ? 0 : 1),
13345 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13346 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13347
13348 if (mode != V4SFmode)
13349 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13350 else if (tmp != target)
13351 emit_move_insn (target, tmp);
13352 }
13353 else if (target != new_target)
13354 emit_move_insn (target, new_target);
13355 return true;
13356
13357 case E_V8HImode:
13358 case E_V16QImode:
13359 vsimode = V4SImode;
13360 goto widen;
13361 case E_V4HImode:
13362 case E_V8QImode:
13363 if (!mmx_ok)
13364 return false;
13365 vsimode = V2SImode;
13366 goto widen;
13367 widen:
13368 if (one_var != 0)
13369 return false;
13370
13371 /* Zero extend the variable element to SImode and recurse. */
13372 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13373
13374 x = gen_reg_rtx (vsimode);
13375 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13376 var, one_var))
13377 gcc_unreachable ();
13378
13379 emit_move_insn (target, gen_lowpart (mode, x));
13380 return true;
13381
13382 default:
13383 return false;
13384 }
13385 }
13386
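/* Illustrative sketch (guarded out) of the RTL shape used above for the
   V4SFmode/V4SImode case when the non-zero value lands in element 0: a
   vec_duplicate merged against the zero vector under mask 1.  The function
   name is a placeholder; V4SFmode is assumed.  */
#if 0
static void
example_one_nonzero_v4sf (rtx target, rtx var)
{
  rtx x = gen_rtx_VEC_DUPLICATE (V4SFmode, force_reg (SFmode, var));
  x = gen_rtx_VEC_MERGE (V4SFmode, x, CONST0_RTX (V4SFmode), const1_rtx);
  emit_insn (gen_rtx_SET (target, x));
}
#endif
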
13387 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13388 consisting of the values in VALS. It is known that all elements
13389 except ONE_VAR are constants. Return true if successful. */
13390
13391 static bool
13392 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13393 rtx target, rtx vals, int one_var)
13394 {
13395 rtx var = XVECEXP (vals, 0, one_var);
13396 machine_mode wmode;
13397 rtx const_vec, x;
13398
13399 const_vec = copy_rtx (vals);
13400 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13401 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13402
13403 switch (mode)
13404 {
13405 case E_V2DFmode:
13406 case E_V2DImode:
13407 case E_V2SFmode:
13408 case E_V2SImode:
13409 /* For the two element vectors, it's just as easy to use
13410 the general case. */
13411 return false;
13412
13413 case E_V4DImode:
13414 /* Use ix86_expand_vector_set in 64bit mode only. */
13415 if (!TARGET_64BIT)
13416 return false;
13417 /* FALLTHRU */
13418 case E_V4DFmode:
13419 case E_V8SFmode:
13420 case E_V8SImode:
13421 case E_V16HImode:
13422 case E_V32QImode:
13423 case E_V4SFmode:
13424 case E_V4SImode:
13425 case E_V8HImode:
13426 case E_V4HImode:
13427 break;
13428
13429 case E_V16QImode:
13430 if (TARGET_SSE4_1)
13431 break;
13432 wmode = V8HImode;
13433 goto widen;
13434 case E_V8QImode:
13435 wmode = V4HImode;
13436 goto widen;
13437 widen:
13438 /* There's no way to set one QImode entry easily. Combine
13439 the variable value with its adjacent constant value, and
13440 promote to an HImode set. */
13441 x = XVECEXP (vals, 0, one_var ^ 1);
13442 if (one_var & 1)
13443 {
13444 var = convert_modes (HImode, QImode, var, true);
13445 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13446 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13447 x = GEN_INT (INTVAL (x) & 0xff);
13448 }
13449 else
13450 {
13451 var = convert_modes (HImode, QImode, var, true);
13452 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13453 }
13454 if (x != const0_rtx)
13455 var = expand_simple_binop (HImode, IOR, var, x, var,
13456 1, OPTAB_LIB_WIDEN);
13457
13458 x = gen_reg_rtx (wmode);
13459 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13460 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13461
13462 emit_move_insn (target, gen_lowpart (mode, x));
13463 return true;
13464
13465 default:
13466 return false;
13467 }
13468
13469 emit_move_insn (target, const_vec);
13470 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13471 return true;
13472 }
13473
13474 /* A subroutine of ix86_expand_vector_init_general. Use vector
13475 concatenate to handle the most general case: all values variable,
13476 and none identical. */
13477
13478 static void
13479 ix86_expand_vector_init_concat (machine_mode mode,
13480 rtx target, rtx *ops, int n)
13481 {
13482 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
13483 rtx first[16], second[8], third[4];
13484 rtvec v;
13485 int i, j;
13486
13487 switch (n)
13488 {
13489 case 2:
13490 switch (mode)
13491 {
13492 case E_V16SImode:
13493 cmode = V8SImode;
13494 break;
13495 case E_V16SFmode:
13496 cmode = V8SFmode;
13497 break;
13498 case E_V8DImode:
13499 cmode = V4DImode;
13500 break;
13501 case E_V8DFmode:
13502 cmode = V4DFmode;
13503 break;
13504 case E_V8SImode:
13505 cmode = V4SImode;
13506 break;
13507 case E_V8SFmode:
13508 cmode = V4SFmode;
13509 break;
13510 case E_V4DImode:
13511 cmode = V2DImode;
13512 break;
13513 case E_V4DFmode:
13514 cmode = V2DFmode;
13515 break;
13516 case E_V4SImode:
13517 cmode = V2SImode;
13518 break;
13519 case E_V4SFmode:
13520 cmode = V2SFmode;
13521 break;
13522 case E_V2DImode:
13523 cmode = DImode;
13524 break;
13525 case E_V2SImode:
13526 cmode = SImode;
13527 break;
13528 case E_V2DFmode:
13529 cmode = DFmode;
13530 break;
13531 case E_V2SFmode:
13532 cmode = SFmode;
13533 break;
13534 default:
13535 gcc_unreachable ();
13536 }
13537
13538 if (!register_operand (ops[1], cmode))
13539 ops[1] = force_reg (cmode, ops[1]);
13540 if (!register_operand (ops[0], cmode))
13541 ops[0] = force_reg (cmode, ops[0]);
13542 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13543 ops[1])));
13544 break;
13545
13546 case 4:
13547 switch (mode)
13548 {
13549 case E_V4DImode:
13550 cmode = V2DImode;
13551 break;
13552 case E_V4DFmode:
13553 cmode = V2DFmode;
13554 break;
13555 case E_V4SImode:
13556 cmode = V2SImode;
13557 break;
13558 case E_V4SFmode:
13559 cmode = V2SFmode;
13560 break;
13561 default:
13562 gcc_unreachable ();
13563 }
13564 goto half;
13565
13566 case 8:
13567 switch (mode)
13568 {
13569 case E_V8DImode:
13570 cmode = V2DImode;
13571 hmode = V4DImode;
13572 break;
13573 case E_V8DFmode:
13574 cmode = V2DFmode;
13575 hmode = V4DFmode;
13576 break;
13577 case E_V8SImode:
13578 cmode = V2SImode;
13579 hmode = V4SImode;
13580 break;
13581 case E_V8SFmode:
13582 cmode = V2SFmode;
13583 hmode = V4SFmode;
13584 break;
13585 default:
13586 gcc_unreachable ();
13587 }
13588 goto half;
13589
13590 case 16:
13591 switch (mode)
13592 {
13593 case E_V16SImode:
13594 cmode = V2SImode;
13595 hmode = V4SImode;
13596 gmode = V8SImode;
13597 break;
13598 case E_V16SFmode:
13599 cmode = V2SFmode;
13600 hmode = V4SFmode;
13601 gmode = V8SFmode;
13602 break;
13603 default:
13604 gcc_unreachable ();
13605 }
13606 goto half;
13607
13608 half:
13609 /* FIXME: We process inputs backward to help RA. PR 36222. */
13610 i = n - 1;
13611 j = (n >> 1) - 1;
13612 for (; i > 0; i -= 2, j--)
13613 {
13614 first[j] = gen_reg_rtx (cmode);
13615 v = gen_rtvec (2, ops[i - 1], ops[i]);
13616 ix86_expand_vector_init (false, first[j],
13617 gen_rtx_PARALLEL (cmode, v));
13618 }
13619
13620 n >>= 1;
13621 if (n > 4)
13622 {
13623 gcc_assert (hmode != VOIDmode);
13624 gcc_assert (gmode != VOIDmode);
13625 for (i = j = 0; i < n; i += 2, j++)
13626 {
13627 second[j] = gen_reg_rtx (hmode);
13628 ix86_expand_vector_init_concat (hmode, second [j],
13629 &first [i], 2);
13630 }
13631 n >>= 1;
13632 for (i = j = 0; i < n; i += 2, j++)
13633 {
13634 third[j] = gen_reg_rtx (gmode);
13635 ix86_expand_vector_init_concat (gmode, third[j],
13636 &second[i], 2);
13637 }
13638 n >>= 1;
13639 ix86_expand_vector_init_concat (mode, target, third, n);
13640 }
13641 else if (n > 2)
13642 {
13643 gcc_assert (hmode != VOIDmode);
13644 for (i = j = 0; i < n; i += 2, j++)
13645 {
13646 second[j] = gen_reg_rtx (hmode);
13647 ix86_expand_vector_init_concat (hmode, second [j],
13648 &first [i], 2);
13649 }
13650 n >>= 1;
13651 ix86_expand_vector_init_concat (mode, target, second, n);
13652 }
13653 else
13654 ix86_expand_vector_init_concat (mode, target, first, n);
13655 break;
13656
13657 default:
13658 gcc_unreachable ();
13659 }
13660 }
13661
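/* Illustrative sketch (guarded out): the simplest use of the helper above
   is the two-operand case, which emits a single VEC_CONCAT.  The function
   name is a placeholder; A and B are DFmode values, which the helper
   forces into registers itself if needed.  */
#if 0
static void
example_concat_v2df (rtx target, rtx a, rtx b)
{
  rtx ops[2] = { a, b };
  ix86_expand_vector_init_concat (V2DFmode, target, ops, 2);
}
#endif
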
13662 /* A subroutine of ix86_expand_vector_init_general. Use vector
13663 interleave to handle the most general case: all values variable,
13664 and none identical. */
13665
13666 static void
13667 ix86_expand_vector_init_interleave (machine_mode mode,
13668 rtx target, rtx *ops, int n)
13669 {
13670 machine_mode first_imode, second_imode, third_imode, inner_mode;
13671 int i, j;
13672 rtx op0, op1;
13673 rtx (*gen_load_even) (rtx, rtx, rtx);
13674 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13675 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13676
13677 switch (mode)
13678 {
13679 case E_V8HImode:
13680 gen_load_even = gen_vec_setv8hi;
13681 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13682 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13683 inner_mode = HImode;
13684 first_imode = V4SImode;
13685 second_imode = V2DImode;
13686 third_imode = VOIDmode;
13687 break;
13688 case E_V16QImode:
13689 gen_load_even = gen_vec_setv16qi;
13690 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13691 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13692 inner_mode = QImode;
13693 first_imode = V8HImode;
13694 second_imode = V4SImode;
13695 third_imode = V2DImode;
13696 break;
13697 default:
13698 gcc_unreachable ();
13699 }
13700
13701 for (i = 0; i < n; i++)
13702 {
13703 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13704 op0 = gen_reg_rtx (SImode);
13705 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13706
13707 /* Insert the SImode value as low element of a V4SImode vector. */
13708 op1 = gen_reg_rtx (V4SImode);
13709 op0 = gen_rtx_VEC_MERGE (V4SImode,
13710 gen_rtx_VEC_DUPLICATE (V4SImode,
13711 op0),
13712 CONST0_RTX (V4SImode),
13713 const1_rtx);
13714 emit_insn (gen_rtx_SET (op1, op0));
13715
13716 /* Cast the V4SImode vector back to a vector in the original mode. */
13717 op0 = gen_reg_rtx (mode);
13718 emit_move_insn (op0, gen_lowpart (mode, op1));
13719
13720 /* Load even elements into the second position. */
13721 emit_insn (gen_load_even (op0,
13722 force_reg (inner_mode,
13723 ops [i + i + 1]),
13724 const1_rtx));
13725
13726 /* Cast vector to FIRST_IMODE vector. */
13727 ops[i] = gen_reg_rtx (first_imode);
13728 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13729 }
13730
13731 /* Interleave low FIRST_IMODE vectors. */
13732 for (i = j = 0; i < n; i += 2, j++)
13733 {
13734 op0 = gen_reg_rtx (first_imode);
13735 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13736
13737 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13738 ops[j] = gen_reg_rtx (second_imode);
13739 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13740 }
13741
13742 /* Interleave low SECOND_IMODE vectors. */
13743 switch (second_imode)
13744 {
13745 case E_V4SImode:
13746 for (i = j = 0; i < n / 2; i += 2, j++)
13747 {
13748 op0 = gen_reg_rtx (second_imode);
13749 emit_insn (gen_interleave_second_low (op0, ops[i],
13750 ops[i + 1]));
13751
13752 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13753 vector. */
13754 ops[j] = gen_reg_rtx (third_imode);
13755 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13756 }
13757 second_imode = V2DImode;
13758 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13759 /* FALLTHRU */
13760
13761 case E_V2DImode:
13762 op0 = gen_reg_rtx (second_imode);
13763 emit_insn (gen_interleave_second_low (op0, ops[0],
13764 ops[1]));
13765
13766 /* Cast the SECOND_IMODE vector back to a vector in the original
13767 mode. */
13768 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13769 break;
13770
13771 default:
13772 gcc_unreachable ();
13773 }
13774 }
13775
13776 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13777 all values variable, and none identical. */
13778
13779 static void
13780 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13781 rtx target, rtx vals)
13782 {
13783 rtx ops[64], op0, op1, op2, op3, op4, op5;
13784 machine_mode half_mode = VOIDmode;
13785 machine_mode quarter_mode = VOIDmode;
13786 int n, i;
13787
13788 switch (mode)
13789 {
13790 case E_V2SFmode:
13791 case E_V2SImode:
13792 if (!mmx_ok && !TARGET_SSE)
13793 break;
13794 /* FALLTHRU */
13795
13796 case E_V16SImode:
13797 case E_V16SFmode:
13798 case E_V8DFmode:
13799 case E_V8DImode:
13800 case E_V8SFmode:
13801 case E_V8SImode:
13802 case E_V4DFmode:
13803 case E_V4DImode:
13804 case E_V4SFmode:
13805 case E_V4SImode:
13806 case E_V2DFmode:
13807 case E_V2DImode:
13808 n = GET_MODE_NUNITS (mode);
13809 for (i = 0; i < n; i++)
13810 ops[i] = XVECEXP (vals, 0, i);
13811 ix86_expand_vector_init_concat (mode, target, ops, n);
13812 return;
13813
13814 case E_V2TImode:
13815 for (i = 0; i < 2; i++)
13816 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13817 op0 = gen_reg_rtx (V4DImode);
13818 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13819 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13820 return;
13821
13822 case E_V4TImode:
13823 for (i = 0; i < 4; i++)
13824 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13825 ops[4] = gen_reg_rtx (V4DImode);
13826 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13827 ops[5] = gen_reg_rtx (V4DImode);
13828 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13829 op0 = gen_reg_rtx (V8DImode);
13830 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13831 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13832 return;
13833
13834 case E_V32QImode:
13835 half_mode = V16QImode;
13836 goto half;
13837
13838 case E_V16HImode:
13839 half_mode = V8HImode;
13840 goto half;
13841
13842 half:
13843 n = GET_MODE_NUNITS (mode);
13844 for (i = 0; i < n; i++)
13845 ops[i] = XVECEXP (vals, 0, i);
13846 op0 = gen_reg_rtx (half_mode);
13847 op1 = gen_reg_rtx (half_mode);
13848 ix86_expand_vector_init_interleave (half_mode, op0, ops,
13849 n >> 2);
13850 ix86_expand_vector_init_interleave (half_mode, op1,
13851 &ops [n >> 1], n >> 2);
13852 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
13853 return;
13854
13855 case E_V64QImode:
13856 quarter_mode = V16QImode;
13857 half_mode = V32QImode;
13858 goto quarter;
13859
13860 case E_V32HImode:
13861 quarter_mode = V8HImode;
13862 half_mode = V16HImode;
13863 goto quarter;
13864
13865 quarter:
13866 n = GET_MODE_NUNITS (mode);
13867 for (i = 0; i < n; i++)
13868 ops[i] = XVECEXP (vals, 0, i);
13869 op0 = gen_reg_rtx (quarter_mode);
13870 op1 = gen_reg_rtx (quarter_mode);
13871 op2 = gen_reg_rtx (quarter_mode);
13872 op3 = gen_reg_rtx (quarter_mode);
13873 op4 = gen_reg_rtx (half_mode);
13874 op5 = gen_reg_rtx (half_mode);
13875 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
13876 n >> 3);
13877 ix86_expand_vector_init_interleave (quarter_mode, op1,
13878 &ops [n >> 2], n >> 3);
13879 ix86_expand_vector_init_interleave (quarter_mode, op2,
13880 &ops [n >> 1], n >> 3);
13881 ix86_expand_vector_init_interleave (quarter_mode, op3,
13882 &ops [(n >> 1) | (n >> 2)], n >> 3);
13883 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
13884 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
13885 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
13886 return;
13887
13888 case E_V16QImode:
13889 if (!TARGET_SSE4_1)
13890 break;
13891 /* FALLTHRU */
13892
13893 case E_V8HImode:
13894 if (!TARGET_SSE2)
13895 break;
13896
13897 /* Don't use ix86_expand_vector_init_interleave if we can't
13898 move from GPR to SSE register directly. */
13899 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
13900 break;
13901
13902 n = GET_MODE_NUNITS (mode);
13903 for (i = 0; i < n; i++)
13904 ops[i] = XVECEXP (vals, 0, i);
13905 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
13906 return;
13907
13908 case E_V4HImode:
13909 case E_V8QImode:
13910 break;
13911
13912 default:
13913 gcc_unreachable ();
13914 }
13915
13916 {
13917 int i, j, n_elts, n_words, n_elt_per_word;
13918 machine_mode inner_mode;
13919 rtx words[4], shift;
13920
13921 inner_mode = GET_MODE_INNER (mode);
13922 n_elts = GET_MODE_NUNITS (mode);
13923 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
13924 n_elt_per_word = n_elts / n_words;
13925 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
13926
13927 for (i = 0; i < n_words; ++i)
13928 {
13929 rtx word = NULL_RTX;
13930
13931 for (j = 0; j < n_elt_per_word; ++j)
13932 {
13933 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
13934 elt = convert_modes (word_mode, inner_mode, elt, true);
13935
13936 if (j == 0)
13937 word = elt;
13938 else
13939 {
13940 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
13941 word, 1, OPTAB_LIB_WIDEN);
13942 word = expand_simple_binop (word_mode, IOR, word, elt,
13943 word, 1, OPTAB_LIB_WIDEN);
13944 }
13945 }
13946
13947 words[i] = word;
13948 }
13949
13950 if (n_words == 1)
13951 emit_move_insn (target, gen_lowpart (mode, words[0]));
13952 else if (n_words == 2)
13953 {
13954 rtx tmp = gen_reg_rtx (mode);
13955 emit_clobber (tmp);
13956 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
13957 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
13958 emit_move_insn (target, tmp);
13959 }
13960 else if (n_words == 4)
13961 {
13962 rtx tmp = gen_reg_rtx (V4SImode);
13963 gcc_assert (word_mode == SImode);
13964 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
13965 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
13966 emit_move_insn (target, gen_lowpart (mode, tmp));
13967 }
13968 else
13969 gcc_unreachable ();
13970 }
13971 }
13972
13973 /* Initialize vector TARGET via VALS. Suppress the use of MMX
13974 instructions unless MMX_OK is true. */
13975
13976 void
13977 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
13978 {
13979 machine_mode mode = GET_MODE (target);
13980 machine_mode inner_mode = GET_MODE_INNER (mode);
13981 int n_elts = GET_MODE_NUNITS (mode);
13982 int n_var = 0, one_var = -1;
13983 bool all_same = true, all_const_zero = true;
13984 int i;
13985 rtx x;
13986
13987 /* First handle initialization from vector elts (the entries of VALS are themselves vectors). */
13988 if (n_elts != XVECLEN (vals, 0))
13989 {
13990 rtx subtarget = target;
13991 x = XVECEXP (vals, 0, 0);
13992 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
13993 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
13994 {
13995 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
13996 if (inner_mode == QImode || inner_mode == HImode)
13997 {
13998 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
13999 mode = mode_for_vector (SImode, n_bits / 4).require ();
14000 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14001 ops[0] = gen_lowpart (inner_mode, ops[0]);
14002 ops[1] = gen_lowpart (inner_mode, ops[1]);
14003 subtarget = gen_reg_rtx (mode);
14004 }
14005 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14006 if (subtarget != target)
14007 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14008 return;
14009 }
14010 gcc_unreachable ();
14011 }
14012
14013 for (i = 0; i < n_elts; ++i)
14014 {
14015 x = XVECEXP (vals, 0, i);
14016 if (!(CONST_SCALAR_INT_P (x)
14017 || CONST_DOUBLE_P (x)
14018 || CONST_FIXED_P (x)))
14019 n_var++, one_var = i;
14020 else if (x != CONST0_RTX (inner_mode))
14021 all_const_zero = false;
14022 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14023 all_same = false;
14024 }
14025
14026 /* Constants are best loaded from the constant pool. */
14027 if (n_var == 0)
14028 {
14029 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14030 return;
14031 }
14032
14033 /* If all values are identical, broadcast the value. */
14034 if (all_same
14035 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14036 XVECEXP (vals, 0, 0)))
14037 return;
14038
14039 /* Values where only one field is non-constant are best loaded from
14040 the pool and overwritten via move later. */
14041 if (n_var == 1)
14042 {
14043 if (all_const_zero
14044 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14045 XVECEXP (vals, 0, one_var),
14046 one_var))
14047 return;
14048
14049 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14050 return;
14051 }
14052
14053 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14054 }
14055
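/* Illustrative sketch (guarded out) of one way a caller can hand a set of
   elements to ix86_expand_vector_init: the elements are wrapped in a
   PARALLEL of the vector mode.  The function name is a placeholder; A..D
   are SImode values and TARGET is a V4SImode register.  */
#if 0
static void
example_init_v4si (rtx target, rtx a, rtx b, rtx c, rtx d)
{
  rtvec v = gen_rtvec (4, a, b, c, d);
  ix86_expand_vector_init (false, target, gen_rtx_PARALLEL (V4SImode, v));
}
#endif
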
14056 void
14057 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14058 {
14059 machine_mode mode = GET_MODE (target);
14060 machine_mode inner_mode = GET_MODE_INNER (mode);
14061 machine_mode half_mode;
14062 bool use_vec_merge = false;
14063 rtx tmp;
14064 static rtx (*gen_extract[6][2]) (rtx, rtx)
14065 = {
14066 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14067 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14068 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14069 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14070 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14071 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14072 };
14073 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14074 = {
14075 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14076 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14077 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14078 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14079 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14080 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14081 };
14082 int i, j, n;
14083 machine_mode mmode = VOIDmode;
14084 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14085
14086 switch (mode)
14087 {
14088 case E_V2SFmode:
14089 case E_V2SImode:
14090 if (mmx_ok)
14091 {
14092 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14093 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14094 if (elt == 0)
14095 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14096 else
14097 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14098 emit_insn (gen_rtx_SET (target, tmp));
14099 return;
14100 }
14101 break;
14102
14103 case E_V2DImode:
14104 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14105 if (use_vec_merge)
14106 break;
14107
14108 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14109 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14110 if (elt == 0)
14111 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14112 else
14113 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14114 emit_insn (gen_rtx_SET (target, tmp));
14115 return;
14116
14117 case E_V2DFmode:
14118 {
14119 rtx op0, op1;
14120
14121 /* For the two element vectors, we implement a VEC_CONCAT with
14122 the extraction of the other element. */
14123
14124 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14125 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14126
14127 if (elt == 0)
14128 op0 = val, op1 = tmp;
14129 else
14130 op0 = tmp, op1 = val;
14131
14132 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14133 emit_insn (gen_rtx_SET (target, tmp));
14134 }
14135 return;
14136
14137 case E_V4SFmode:
14138 use_vec_merge = TARGET_SSE4_1;
14139 if (use_vec_merge)
14140 break;
14141
14142 switch (elt)
14143 {
14144 case 0:
14145 use_vec_merge = true;
14146 break;
14147
14148 case 1:
14149 /* tmp = target = A B C D */
14150 tmp = copy_to_reg (target);
14151 /* target = A A B B */
14152 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14153 /* target = X A B B */
14154 ix86_expand_vector_set (false, target, val, 0);
14155 /* target = A X C D */
14156 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14157 const1_rtx, const0_rtx,
14158 GEN_INT (2+4), GEN_INT (3+4)));
14159 return;
14160
14161 case 2:
14162 /* tmp = target = A B C D */
14163 tmp = copy_to_reg (target);
14164 /* tmp = X B C D */
14165 ix86_expand_vector_set (false, tmp, val, 0);
14166 /* target = A B X D */
14167 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14168 const0_rtx, const1_rtx,
14169 GEN_INT (0+4), GEN_INT (3+4)));
14170 return;
14171
14172 case 3:
14173 /* tmp = target = A B C D */
14174 tmp = copy_to_reg (target);
14175 /* tmp = X B C D */
14176 ix86_expand_vector_set (false, tmp, val, 0);
14177 /* target = A B C X */
14178 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14179 const0_rtx, const1_rtx,
14180 GEN_INT (2+4), GEN_INT (0+4)));
14181 return;
14182
14183 default:
14184 gcc_unreachable ();
14185 }
14186 break;
14187
14188 case E_V4SImode:
14189 use_vec_merge = TARGET_SSE4_1;
14190 if (use_vec_merge)
14191 break;
14192
14193 /* Element 0 handled by vec_merge below. */
14194 if (elt == 0)
14195 {
14196 use_vec_merge = true;
14197 break;
14198 }
14199
14200 if (TARGET_SSE2)
14201 {
14202 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14203 store into element 0, then shuffle them back. */
14204
14205 rtx order[4];
14206
14207 order[0] = GEN_INT (elt);
14208 order[1] = const1_rtx;
14209 order[2] = const2_rtx;
14210 order[3] = GEN_INT (3);
14211 order[elt] = const0_rtx;
14212
14213 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14214 order[1], order[2], order[3]));
14215
14216 ix86_expand_vector_set (false, target, val, 0);
14217
14218 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14219 order[1], order[2], order[3]));
14220 }
14221 else
14222 {
14223 /* For SSE1, we have to reuse the V4SF code. */
14224 rtx t = gen_reg_rtx (V4SFmode);
14225 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14226 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14227 emit_move_insn (target, gen_lowpart (mode, t));
14228 }
14229 return;
14230
14231 case E_V8HImode:
14232 use_vec_merge = TARGET_SSE2;
14233 break;
14234 case E_V4HImode:
14235 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14236 break;
14237
14238 case E_V16QImode:
14239 use_vec_merge = TARGET_SSE4_1;
14240 break;
14241
14242 case E_V8QImode:
14243 break;
14244
14245 case E_V32QImode:
14246 half_mode = V16QImode;
14247 j = 0;
14248 n = 16;
14249 goto half;
14250
14251 case E_V16HImode:
14252 half_mode = V8HImode;
14253 j = 1;
14254 n = 8;
14255 goto half;
14256
14257 case E_V8SImode:
14258 half_mode = V4SImode;
14259 j = 2;
14260 n = 4;
14261 goto half;
14262
14263 case E_V4DImode:
14264 half_mode = V2DImode;
14265 j = 3;
14266 n = 2;
14267 goto half;
14268
14269 case E_V8SFmode:
14270 half_mode = V4SFmode;
14271 j = 4;
14272 n = 4;
14273 goto half;
14274
14275 case E_V4DFmode:
14276 half_mode = V2DFmode;
14277 j = 5;
14278 n = 2;
14279 goto half;
14280
14281 half:
14282 /* Compute offset. */
14283 i = elt / n;
14284 elt %= n;
14285
14286 gcc_assert (i <= 1);
14287
14288 /* Extract the half. */
14289 tmp = gen_reg_rtx (half_mode);
14290 emit_insn (gen_extract[j][i] (tmp, target));
14291
14292 /* Put val in tmp at elt. */
14293 ix86_expand_vector_set (false, tmp, val, elt);
14294
14295 /* Put it back. */
14296 emit_insn (gen_insert[j][i] (target, target, tmp));
14297 return;
14298
14299 case E_V8DFmode:
14300 if (TARGET_AVX512F)
14301 {
14302 mmode = QImode;
14303 gen_blendm = gen_avx512f_blendmv8df;
14304 }
14305 break;
14306
14307 case E_V8DImode:
14308 if (TARGET_AVX512F)
14309 {
14310 mmode = QImode;
14311 gen_blendm = gen_avx512f_blendmv8di;
14312 }
14313 break;
14314
14315 case E_V16SFmode:
14316 if (TARGET_AVX512F)
14317 {
14318 mmode = HImode;
14319 gen_blendm = gen_avx512f_blendmv16sf;
14320 }
14321 break;
14322
14323 case E_V16SImode:
14324 if (TARGET_AVX512F)
14325 {
14326 mmode = HImode;
14327 gen_blendm = gen_avx512f_blendmv16si;
14328 }
14329 break;
14330
14331 case E_V32HImode:
14332 if (TARGET_AVX512BW)
14333 {
14334 mmode = SImode;
14335 gen_blendm = gen_avx512bw_blendmv32hi;
14336 }
14337 else if (TARGET_AVX512F)
14338 {
14339 half_mode = E_V8HImode;
14340 n = 8;
14341 goto quarter;
14342 }
14343 break;
14344
14345 case E_V64QImode:
14346 if (TARGET_AVX512BW)
14347 {
14348 mmode = DImode;
14349 gen_blendm = gen_avx512bw_blendmv64qi;
14350 }
14351 else if (TARGET_AVX512F)
14352 {
14353 half_mode = E_V16QImode;
14354 n = 16;
14355 goto quarter;
14356 }
14357 break;
14358
14359 quarter:
14360 /* Compute offset. */
14361 i = elt / n;
14362 elt %= n;
14363
14364 gcc_assert (i <= 3);
14365
14366 {
14367 /* Extract the quarter. */
14368 tmp = gen_reg_rtx (V4SImode);
14369 rtx tmp2 = gen_lowpart (V16SImode, target);
14370 rtx mask = gen_reg_rtx (QImode);
14371
14372 emit_move_insn (mask, constm1_rtx);
14373 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14374 tmp, mask));
14375
14376 tmp2 = gen_reg_rtx (half_mode);
14377 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14378 tmp = tmp2;
14379
14380 /* Put val in tmp at elt. */
14381 ix86_expand_vector_set (false, tmp, val, elt);
14382
14383 /* Put it back. */
14384 tmp2 = gen_reg_rtx (V16SImode);
14385 rtx tmp3 = gen_lowpart (V16SImode, target);
14386 mask = gen_reg_rtx (HImode);
14387 emit_move_insn (mask, constm1_rtx);
14388 tmp = gen_lowpart (V4SImode, tmp);
14389 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14390 tmp3, mask));
14391 emit_move_insn (target, gen_lowpart (mode, tmp2));
14392 }
14393 return;
14394
14395 default:
14396 break;
14397 }
14398
14399 if (mmode != VOIDmode)
14400 {
14401 tmp = gen_reg_rtx (mode);
14402 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14403 /* The avx512*_blendm<mode> expanders have different operand order
14404 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14405 elements where the mask is set and second input operand otherwise,
14406 in {sse,avx}*_*blend* the first input operand is used for elements
14407 where the mask is clear and second input operand otherwise. */
14408 emit_insn (gen_blendm (target, target, tmp,
14409 force_reg (mmode,
14410 gen_int_mode (HOST_WIDE_INT_1U << elt,
14411 mmode))));
14412 }
14413 else if (use_vec_merge)
14414 {
14415 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14416 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14417 GEN_INT (HOST_WIDE_INT_1U << elt));
14418 emit_insn (gen_rtx_SET (target, tmp));
14419 }
14420 else
14421 {
14422 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14423
14424 emit_move_insn (mem, target);
14425
14426 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14427 emit_move_insn (tmp, val);
14428
14429 emit_move_insn (target, mem);
14430 }
14431 }
14432
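/* Illustrative sketch (guarded out): overwriting a single element of an
   existing vector, here element 2 of a V4SFmode TARGET with an SFmode VAL;
   the other elements of TARGET are preserved.  The function name is a
   placeholder.  */
#if 0
static void
example_set_third_elt (rtx target, rtx val)
{
  ix86_expand_vector_set (false, target, val, 2);
}
#endif
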
14433 void
14434 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14435 {
14436 machine_mode mode = GET_MODE (vec);
14437 machine_mode inner_mode = GET_MODE_INNER (mode);
14438 bool use_vec_extr = false;
14439 rtx tmp;
14440
14441 switch (mode)
14442 {
14443 case E_V2SImode:
14444 case E_V2SFmode:
14445 if (!mmx_ok)
14446 break;
14447 /* FALLTHRU */
14448
14449 case E_V2DFmode:
14450 case E_V2DImode:
14451 case E_V2TImode:
14452 case E_V4TImode:
14453 use_vec_extr = true;
14454 break;
14455
14456 case E_V4SFmode:
14457 use_vec_extr = TARGET_SSE4_1;
14458 if (use_vec_extr)
14459 break;
14460
14461 switch (elt)
14462 {
14463 case 0:
14464 tmp = vec;
14465 break;
14466
14467 case 1:
14468 case 3:
14469 tmp = gen_reg_rtx (mode);
14470 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14471 GEN_INT (elt), GEN_INT (elt),
14472 GEN_INT (elt+4), GEN_INT (elt+4)));
14473 break;
14474
14475 case 2:
14476 tmp = gen_reg_rtx (mode);
14477 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14478 break;
14479
14480 default:
14481 gcc_unreachable ();
14482 }
14483 vec = tmp;
14484 use_vec_extr = true;
14485 elt = 0;
14486 break;
14487
14488 case E_V4SImode:
14489 use_vec_extr = TARGET_SSE4_1;
14490 if (use_vec_extr)
14491 break;
14492
14493 if (TARGET_SSE2)
14494 {
14495 switch (elt)
14496 {
14497 case 0:
14498 tmp = vec;
14499 break;
14500
14501 case 1:
14502 case 3:
14503 tmp = gen_reg_rtx (mode);
14504 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14505 GEN_INT (elt), GEN_INT (elt),
14506 GEN_INT (elt), GEN_INT (elt)));
14507 break;
14508
14509 case 2:
14510 tmp = gen_reg_rtx (mode);
14511 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14512 break;
14513
14514 default:
14515 gcc_unreachable ();
14516 }
14517 vec = tmp;
14518 use_vec_extr = true;
14519 elt = 0;
14520 }
14521 else
14522 {
14523 /* For SSE1, we have to reuse the V4SF code. */
14524 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14525 gen_lowpart (V4SFmode, vec), elt);
14526 return;
14527 }
14528 break;
14529
14530 case E_V8HImode:
14531 use_vec_extr = TARGET_SSE2;
14532 break;
14533 case E_V4HImode:
14534 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14535 break;
14536
14537 case E_V16QImode:
14538 use_vec_extr = TARGET_SSE4_1;
14539 break;
14540
14541 case E_V8SFmode:
14542 if (TARGET_AVX)
14543 {
14544 tmp = gen_reg_rtx (V4SFmode);
14545 if (elt < 4)
14546 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14547 else
14548 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14549 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14550 return;
14551 }
14552 break;
14553
14554 case E_V4DFmode:
14555 if (TARGET_AVX)
14556 {
14557 tmp = gen_reg_rtx (V2DFmode);
14558 if (elt < 2)
14559 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14560 else
14561 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14562 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14563 return;
14564 }
14565 break;
14566
14567 case E_V32QImode:
14568 if (TARGET_AVX)
14569 {
14570 tmp = gen_reg_rtx (V16QImode);
14571 if (elt < 16)
14572 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14573 else
14574 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14575 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14576 return;
14577 }
14578 break;
14579
14580 case E_V16HImode:
14581 if (TARGET_AVX)
14582 {
14583 tmp = gen_reg_rtx (V8HImode);
14584 if (elt < 8)
14585 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14586 else
14587 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14588 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14589 return;
14590 }
14591 break;
14592
14593 case E_V8SImode:
14594 if (TARGET_AVX)
14595 {
14596 tmp = gen_reg_rtx (V4SImode);
14597 if (elt < 4)
14598 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14599 else
14600 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14601 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14602 return;
14603 }
14604 break;
14605
14606 case E_V4DImode:
14607 if (TARGET_AVX)
14608 {
14609 tmp = gen_reg_rtx (V2DImode);
14610 if (elt < 2)
14611 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14612 else
14613 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14614 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14615 return;
14616 }
14617 break;
14618
14619 case E_V32HImode:
14620 if (TARGET_AVX512BW)
14621 {
14622 tmp = gen_reg_rtx (V16HImode);
14623 if (elt < 16)
14624 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14625 else
14626 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14627 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14628 return;
14629 }
14630 break;
14631
14632 case E_V64QImode:
14633 if (TARGET_AVX512BW)
14634 {
14635 tmp = gen_reg_rtx (V32QImode);
14636 if (elt < 32)
14637 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14638 else
14639 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14640 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14641 return;
14642 }
14643 break;
14644
14645 case E_V16SFmode:
14646 tmp = gen_reg_rtx (V8SFmode);
14647 if (elt < 8)
14648 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14649 else
14650 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14651 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14652 return;
14653
14654 case E_V8DFmode:
14655 tmp = gen_reg_rtx (V4DFmode);
14656 if (elt < 4)
14657 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14658 else
14659 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14660 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14661 return;
14662
14663 case E_V16SImode:
14664 tmp = gen_reg_rtx (V8SImode);
14665 if (elt < 8)
14666 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14667 else
14668 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14669 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14670 return;
14671
14672 case E_V8DImode:
14673 tmp = gen_reg_rtx (V4DImode);
14674 if (elt < 4)
14675 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14676 else
14677 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14678 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14679 return;
14680
14681 case E_V8QImode:
14682 /* ??? Could extract the appropriate HImode element and shift. */
14683 default:
14684 break;
14685 }
14686
14687 if (use_vec_extr)
14688 {
14689 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14690 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14691
14692 /* Let the rtl optimizers know about the zero extension performed. */
14693 if (inner_mode == QImode || inner_mode == HImode)
14694 {
14695 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14696 target = gen_lowpart (SImode, target);
14697 }
14698
14699 emit_insn (gen_rtx_SET (target, tmp));
14700 }
14701 else
14702 {
14703 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14704
14705 emit_move_insn (mem, vec);
14706
14707 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14708 emit_move_insn (target, tmp);
14709 }
14710 }
14711
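/* Illustrative sketch (guarded out): reading element ELT of a V2DFmode
   vector into a fresh DFmode register.  In the use_vec_extr case this is a
   single vec_select; otherwise the vector is spilled to a stack temporary
   and the element reloaded.  The function name is a placeholder.  */
#if 0
static rtx
example_extract_v2df (rtx vec, int elt)
{
  rtx dest = gen_reg_rtx (DFmode);
  ix86_expand_vector_extract (false, dest, vec, elt);
  return dest;
}
#endif
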
14712 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14713 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14714 The upper bits of DEST are undefined, though they shouldn't cause
14715 exceptions (some bits from SRC or all zeros are OK). */
14716
14717 static void
14718 emit_reduc_half (rtx dest, rtx src, int i)
14719 {
14720 rtx tem, d = dest;
14721 switch (GET_MODE (src))
14722 {
14723 case E_V4SFmode:
14724 if (i == 128)
14725 tem = gen_sse_movhlps (dest, src, src);
14726 else
14727 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14728 GEN_INT (1 + 4), GEN_INT (1 + 4));
14729 break;
14730 case E_V2DFmode:
14731 tem = gen_vec_interleave_highv2df (dest, src, src);
14732 break;
14733 case E_V16QImode:
14734 case E_V8HImode:
14735 case E_V4SImode:
14736 case E_V2DImode:
14737 d = gen_reg_rtx (V1TImode);
14738 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14739 GEN_INT (i / 2));
14740 break;
14741 case E_V8SFmode:
14742 if (i == 256)
14743 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14744 else
14745 tem = gen_avx_shufps256 (dest, src, src,
14746 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14747 break;
14748 case E_V4DFmode:
14749 if (i == 256)
14750 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14751 else
14752 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14753 break;
14754 case E_V32QImode:
14755 case E_V16HImode:
14756 case E_V8SImode:
14757 case E_V4DImode:
14758 if (i == 256)
14759 {
14760 if (GET_MODE (dest) != V4DImode)
14761 d = gen_reg_rtx (V4DImode);
14762 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14763 gen_lowpart (V4DImode, src),
14764 const1_rtx);
14765 }
14766 else
14767 {
14768 d = gen_reg_rtx (V2TImode);
14769 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14770 GEN_INT (i / 2));
14771 }
14772 break;
14773 case E_V64QImode:
14774 case E_V32HImode:
14775 case E_V16SImode:
14776 case E_V16SFmode:
14777 case E_V8DImode:
14778 case E_V8DFmode:
14779 if (i > 128)
14780 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14781 gen_lowpart (V16SImode, src),
14782 gen_lowpart (V16SImode, src),
14783 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14784 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14785 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14786 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14787 GEN_INT (0xC), GEN_INT (0xD),
14788 GEN_INT (0xE), GEN_INT (0xF),
14789 GEN_INT (0x10), GEN_INT (0x11),
14790 GEN_INT (0x12), GEN_INT (0x13),
14791 GEN_INT (0x14), GEN_INT (0x15),
14792 GEN_INT (0x16), GEN_INT (0x17));
14793 else
14794 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14795 gen_lowpart (V16SImode, src),
14796 GEN_INT (i == 128 ? 0x2 : 0x1),
14797 GEN_INT (0x3),
14798 GEN_INT (0x3),
14799 GEN_INT (0x3),
14800 GEN_INT (i == 128 ? 0x6 : 0x5),
14801 GEN_INT (0x7),
14802 GEN_INT (0x7),
14803 GEN_INT (0x7),
14804 GEN_INT (i == 128 ? 0xA : 0x9),
14805 GEN_INT (0xB),
14806 GEN_INT (0xB),
14807 GEN_INT (0xB),
14808 GEN_INT (i == 128 ? 0xE : 0xD),
14809 GEN_INT (0xF),
14810 GEN_INT (0xF),
14811 GEN_INT (0xF));
14812 break;
14813 default:
14814 gcc_unreachable ();
14815 }
14816 emit_insn (tem);
14817 if (d != dest)
14818 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
14819 }
14820
14821 /* Expand a vector reduction. FN is the binary pattern to reduce;
14822 DEST is the destination; IN is the input vector. */
14823
14824 void
14825 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
14826 {
14827 rtx half, dst, vec = in;
14828 machine_mode mode = GET_MODE (in);
14829 int i;
14830
14831 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
14832 if (TARGET_SSE4_1
14833 && mode == V8HImode
14834 && fn == gen_uminv8hi3)
14835 {
14836 emit_insn (gen_sse4_1_phminposuw (dest, in));
14837 return;
14838 }
14839
14840 for (i = GET_MODE_BITSIZE (mode);
14841 i > GET_MODE_UNIT_BITSIZE (mode);
14842 i >>= 1)
14843 {
14844 half = gen_reg_rtx (mode);
14845 emit_reduc_half (half, vec, i);
14846 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
14847 dst = dest;
14848 else
14849 dst = gen_reg_rtx (mode);
14850 emit_insn (fn (dst, half, vec));
14851 vec = dst;
14852 }
14853 }
14854
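/* Illustrative sketch (guarded out): a horizontal add of a V4SImode vector.
   Each emit_reduc_half step folds the upper half onto the lower half, so
   after log2(nelts) steps the full reduction can be read from element 0 of
   DEST.  gen_addv4si3 is assumed to be the standard add<mode>3 expander
   for V4SImode (available with SSE2); the function name is a placeholder.  */
#if 0
static void
example_reduc_plus_v4si (rtx dest, rtx in)
{
  ix86_expand_reduc (gen_addv4si3, dest, in);
}
#endif
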
14855 /* Output code to perform a conditional jump to LABEL if the C2 flag in the
14856 FP status register is set. */
14857
14858 void
14859 ix86_emit_fp_unordered_jump (rtx label)
14860 {
14861 rtx reg = gen_reg_rtx (HImode);
14862 rtx_insn *insn;
14863 rtx temp;
14864
14865 emit_insn (gen_x86_fnstsw_1 (reg));
14866
14867 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
14868 {
14869 emit_insn (gen_x86_sahf_1 (reg));
14870
14871 temp = gen_rtx_REG (CCmode, FLAGS_REG);
14872 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
14873 }
14874 else
14875 {
14876 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
14877
14878 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14879 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
14880 }
14881
14882 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
14883 gen_rtx_LABEL_REF (VOIDmode, label),
14884 pc_rtx);
14885 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
14886 predict_jump (REG_BR_PROB_BASE * 10 / 100);
14887 JUMP_LABEL (insn) = label;
14888 }
14889
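/* In the fnstsw result, C2 is bit 10 of the FP status word, i.e. bit 2
   (mask 0x04) of its high byte, which is what the non-SAHF path above
   tests.  With SAHF, the high byte is transferred into EFLAGS and the
   branch tests the UNORDERED condition on the flags register directly.  */
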
14890 /* Output code to perform a sinh XFmode calculation. */
14891
14892 void ix86_emit_i387_sinh (rtx op0, rtx op1)
14893 {
14894 rtx e1 = gen_reg_rtx (XFmode);
14895 rtx e2 = gen_reg_rtx (XFmode);
14896 rtx scratch = gen_reg_rtx (HImode);
14897 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
14898 rtx half = const_double_from_real_value (dconsthalf, XFmode);
14899 rtx cst1, tmp;
14900 rtx_code_label *jump_label = gen_label_rtx ();
14901 rtx_insn *insn;
14902
14903 /* scratch = fxam (op1) */
14904 emit_insn (gen_fxamxf2_i387 (scratch, op1));
14905
14906 /* e1 = expm1 (|op1|) */
14907 emit_insn (gen_absxf2 (e2, op1));
14908 emit_insn (gen_expm1xf2 (e1, e2));
14909
14910 /* e2 = e1 / (e1 + 1.0) + e1 */
14911 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
14912 emit_insn (gen_addxf3 (e2, e1, cst1));
14913 emit_insn (gen_divxf3 (e2, e1, e2));
14914 emit_insn (gen_addxf3 (e2, e2, e1));
14915
14916 /* flags = signbit (op1) */
14917 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
14918
14919 /* if (flags) then e2 = -e2 */
14920 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
14921 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
14922 gen_rtx_LABEL_REF (VOIDmode, jump_label),
14923 pc_rtx);
14924 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
14925 predict_jump (REG_BR_PROB_BASE * 50 / 100);
14926 JUMP_LABEL (insn) = jump_label;
14927
14928 emit_insn (gen_negxf2 (e2, e2));
14929
14930 emit_label (jump_label);
14931 LABEL_NUSES (jump_label) = 1;
14932
14933 /* op0 = 0.5 * e2 */
14934 half = force_reg (XFmode, half);
14935 emit_insn (gen_mulxf3 (op0, e2, half));
14936 }
14937
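/* The identity behind the sequence above: with u = expm1 (|op1|), so that
   exp (|op1|) = u + 1 and exp (-|op1|) = 1 / (u + 1),

     sinh (|op1|) = (exp (|op1|) - exp (-|op1|)) / 2
		  = ((u + 1) - 1 / (u + 1)) / 2
		  = (u / (u + 1) + u) / 2,

   which is exactly e2 scaled by 0.5.  Going through expm1 keeps precision
   for small |op1|; the sign is patched back in from the FXAM result.  */
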
14938 /* Output code to perform a cosh XFmode calculation. */
14939
14940 void ix86_emit_i387_cosh (rtx op0, rtx op1)
14941 {
14942 rtx e1 = gen_reg_rtx (XFmode);
14943 rtx e2 = gen_reg_rtx (XFmode);
14944 rtx half = const_double_from_real_value (dconsthalf, XFmode);
14945 rtx cst1;
14946
14947 /* e1 = exp (op1) */
14948 emit_insn (gen_expxf2 (e1, op1));
14949
14950 /* e2 = e1 + 1.0 / e1 */
14951 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
14952 emit_insn (gen_divxf3 (e2, cst1, e1));
14953 emit_insn (gen_addxf3 (e2, e1, e2));
14954
14955 /* op0 = 0.5 * e2 */
14956 half = force_reg (XFmode, half);
14957 emit_insn (gen_mulxf3 (op0, e2, half));
14958 }
14959
14960 /* Output code to perform a tanh XFmode calculation. */
14961
14962 void ix86_emit_i387_tanh (rtx op0, rtx op1)
14963 {
14964 rtx e1 = gen_reg_rtx (XFmode);
14965 rtx e2 = gen_reg_rtx (XFmode);
14966 rtx scratch = gen_reg_rtx (HImode);
14967 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
14968 rtx cst2, tmp;
14969 rtx_code_label *jump_label = gen_label_rtx ();
14970 rtx_insn *insn;
14971
14972 /* scratch = fxam (op1) */
14973 emit_insn (gen_fxamxf2_i387 (scratch, op1));
14974
14975 /* e1 = expm1 (-|2 * op1|) */
14976 emit_insn (gen_addxf3 (e2, op1, op1));
14977 emit_insn (gen_absxf2 (e2, e2));
14978 emit_insn (gen_negxf2 (e2, e2));
14979 emit_insn (gen_expm1xf2 (e1, e2));
14980
14981 /* e2 = e1 / (e1 + 2.0) */
14982 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
14983 emit_insn (gen_addxf3 (e2, e1, cst2));
14984 emit_insn (gen_divxf3 (e2, e1, e2));
14985
14986 /* flags = signbit (op1) */
14987 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
14988
14989 /* if (!flags) then e2 = -e2 */
14990 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
14991 gen_rtx_NE (VOIDmode, flags, const0_rtx),
14992 gen_rtx_LABEL_REF (VOIDmode, jump_label),
14993 pc_rtx);
14994 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
14995 predict_jump (REG_BR_PROB_BASE * 50 / 100);
14996 JUMP_LABEL (insn) = jump_label;
14997
14998 emit_insn (gen_negxf2 (e2, e2));
14999
15000 emit_label (jump_label);
15001 LABEL_NUSES (jump_label) = 1;
15002
15003 emit_move_insn (op0, e2);
15004 }
15005
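/* The identity behind the sequence above: with u = expm1 (-2 * |op1|),

     tanh (|op1|) = (1 - exp (-2 * |op1|)) / (1 + exp (-2 * |op1|))
		  = -u / (u + 2),

   so e2 = u / (u + 2) holds -tanh (|op1|) and is negated when op1 is
   non-negative (FXAM sign bit clear), giving tanh (op1) in both cases.  */
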
15006 /* Output code to perform an asinh XFmode calculation. */
15007
15008 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15009 {
15010 rtx e1 = gen_reg_rtx (XFmode);
15011 rtx e2 = gen_reg_rtx (XFmode);
15012 rtx scratch = gen_reg_rtx (HImode);
15013 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15014 rtx cst1, tmp;
15015 rtx_code_label *jump_label = gen_label_rtx ();
15016 rtx_insn *insn;
15017
15018 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15019 emit_insn (gen_mulxf3 (e1, op1, op1));
15020 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15021 emit_insn (gen_addxf3 (e2, e1, cst1));
15022 emit_insn (gen_sqrtxf2 (e2, e2));
15023 emit_insn (gen_addxf3 (e2, e2, cst1));
15024
15025 /* e1 = e1 / e2 */
15026 emit_insn (gen_divxf3 (e1, e1, e2));
15027
15028 /* scratch = fxam (op1) */
15029 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15030
15031 /* e1 = e1 + |op1| */
15032 emit_insn (gen_absxf2 (e2, op1));
15033 emit_insn (gen_addxf3 (e1, e1, e2));
15034
15035 /* e2 = log1p (e1) */
15036 ix86_emit_i387_log1p (e2, e1);
15037
15038 /* flags = signbit (op1) */
15039 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15040
15041 /* if (flags) then e2 = -e2 */
15042 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15043 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15044 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15045 pc_rtx);
15046 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15047 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15048 JUMP_LABEL (insn) = jump_label;
15049
15050 emit_insn (gen_negxf2 (e2, e2));
15051
15052 emit_label (jump_label);
15053 LABEL_NUSES (jump_label) = 1;
15054
15055 emit_move_insn (op0, e2);
15056 }
15057
15058 /* Output code to perform an acosh XFmode calculation. */
15059
15060 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15061 {
15062 rtx e1 = gen_reg_rtx (XFmode);
15063 rtx e2 = gen_reg_rtx (XFmode);
15064 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15065
15066 /* e2 = sqrt (op1 + 1.0) */
15067 emit_insn (gen_addxf3 (e2, op1, cst1));
15068 emit_insn (gen_sqrtxf2 (e2, e2));
15069
15070 /* e1 = sqrt (op1 - 1.0) */
15071 emit_insn (gen_subxf3 (e1, op1, cst1));
15072 emit_insn (gen_sqrtxf2 (e1, e1));
15073
15074 /* e1 = e1 * e2 */
15075 emit_insn (gen_mulxf3 (e1, e1, e2));
15076
15077 /* e1 = e1 + op1 */
15078 emit_insn (gen_addxf3 (e1, e1, op1));
15079
15080 /* op0 = log (e1) */
15081 emit_insn (gen_logxf2 (op0, e1));
15082 }
15083
15084 /* Output code to perform an atanh XFmode calculation. */
15085
15086 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15087 {
15088 rtx e1 = gen_reg_rtx (XFmode);
15089 rtx e2 = gen_reg_rtx (XFmode);
15090 rtx scratch = gen_reg_rtx (HImode);
15091 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15092 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15093 rtx cst1, tmp;
15094 rtx_code_label *jump_label = gen_label_rtx ();
15095 rtx_insn *insn;
15096
15097 /* scratch = fxam (op1) */
15098 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15099
15100 /* e2 = |op1| */
15101 emit_insn (gen_absxf2 (e2, op1));
15102
15103 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15104 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15105 emit_insn (gen_addxf3 (e1, e2, cst1));
15106 emit_insn (gen_addxf3 (e2, e2, e2));
15107 emit_insn (gen_negxf2 (e2, e2));
15108 emit_insn (gen_divxf3 (e1, e2, e1));
15109
15110 /* e2 = log1p (e1) */
15111 ix86_emit_i387_log1p (e2, e1);
15112
15113 /* flags = signbit (op1) */
15114 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15115
15116 /* if (!flags) then e2 = -e2 */
15117 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15118 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15119 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15120 pc_rtx);
15121 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15122 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15123 JUMP_LABEL (insn) = jump_label;
15124
15125 emit_insn (gen_negxf2 (e2, e2));
15126
15127 emit_label (jump_label);
15128 LABEL_NUSES (jump_label) = 1;
15129
15130 /* op0 = 0.5 * e2 */
15131 half = force_reg (XFmode, half);
15132 emit_insn (gen_mulxf3 (op0, e2, half));
15133 }
15134
15135 /* Output code to perform a log1p XFmode calculation. */
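/* Illustrative sketch of the expansion below (exposition only).  The
   threshold constant is 1 - sqrt(2)/2, the operand range for which fyl2xp1
   is specified:

     if (fabs (x) < 0.29289321881345247561810596348408353)
       return log1p (x);        /* fyl2xp1: ln2 * log2 (x + 1), accurate near 0 */
     else
       return log (x + 1.0);    /* fyl2x on x + 1.0 */
*/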
15136
15137 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15138 {
15139 rtx_code_label *label1 = gen_label_rtx ();
15140 rtx_code_label *label2 = gen_label_rtx ();
15141
15142 rtx tmp = gen_reg_rtx (XFmode);
15143 rtx res = gen_reg_rtx (XFmode);
15144 rtx cst, cstln2, cst1;
15145 rtx_insn *insn;
15146
15147 cst = const_double_from_real_value
15148 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15149 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15150
15151 emit_insn (gen_absxf2 (tmp, op1));
15152
15153 cst = force_reg (XFmode, cst);
15154 ix86_expand_branch (GE, tmp, cst, label1);
15155 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15156 insn = get_last_insn ();
15157 JUMP_LABEL (insn) = label1;
15158
15159 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15160 emit_jump (label2);
15161
15162 emit_label (label1);
15163 LABEL_NUSES (label1) = 1;
15164
15165 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15166 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15167 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15168
15169 emit_label (label2);
15170 LABEL_NUSES (label2) = 1;
15171
15172 emit_move_insn (op0, res);
15173 }
15174
15175 /* Emit code for round calculation. */
15176 void ix86_emit_i387_round (rtx op0, rtx op1)
15177 {
15178 machine_mode inmode = GET_MODE (op1);
15179 machine_mode outmode = GET_MODE (op0);
15180 rtx e1 = gen_reg_rtx (XFmode);
15181 rtx e2 = gen_reg_rtx (XFmode);
15182 rtx scratch = gen_reg_rtx (HImode);
15183 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15184 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15185 rtx res = gen_reg_rtx (outmode);
15186 rtx_code_label *jump_label = gen_label_rtx ();
15187 rtx (*floor_insn) (rtx, rtx);
15188 rtx (*neg_insn) (rtx, rtx);
15189 rtx_insn *insn;
15190 rtx tmp;
15191
15192 switch (inmode)
15193 {
15194 case E_SFmode:
15195 case E_DFmode:
15196 tmp = gen_reg_rtx (XFmode);
15197
15198 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15199 op1 = tmp;
15200 break;
15201 case E_XFmode:
15202 break;
15203 default:
15204 gcc_unreachable ();
15205 }
15206
15207 switch (outmode)
15208 {
15209 case E_SFmode:
15210 floor_insn = gen_frndintxf2_floor;
15211 neg_insn = gen_negsf2;
15212 break;
15213 case E_DFmode:
15214 floor_insn = gen_frndintxf2_floor;
15215 neg_insn = gen_negdf2;
15216 break;
15217 case E_XFmode:
15218 floor_insn = gen_frndintxf2_floor;
15219 neg_insn = gen_negxf2;
15220 break;
15221 case E_HImode:
15222 floor_insn = gen_lfloorxfhi2;
15223 neg_insn = gen_neghi2;
15224 break;
15225 case E_SImode:
15226 floor_insn = gen_lfloorxfsi2;
15227 neg_insn = gen_negsi2;
15228 break;
15229 case E_DImode:
15230 floor_insn = gen_lfloorxfdi2;
15231 neg_insn = gen_negdi2;
15232 break;
15233 default:
15234 gcc_unreachable ();
15235 }
15236
15237 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15238
15239 /* scratch = fxam(op1) */
15240 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15241
15242 /* e1 = fabs(op1) */
15243 emit_insn (gen_absxf2 (e1, op1));
15244
15245 /* e2 = e1 + 0.5 */
15246 half = force_reg (XFmode, half);
15247 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15248
15249 /* res = floor(e2) */
15250 switch (outmode)
15251 {
15252 case E_SFmode:
15253 case E_DFmode:
15254 {
15255 tmp = gen_reg_rtx (XFmode);
15256
15257 emit_insn (floor_insn (tmp, e2));
15258 emit_insn (gen_rtx_SET (res,
15259 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15260 UNSPEC_TRUNC_NOOP)));
15261 }
15262 break;
15263 default:
15264 emit_insn (floor_insn (res, e2));
15265 }
15266
15267 /* flags = signbit(a) */
15268 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15269
15270 /* if (flags) then res = -res */
15271 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15272 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15273 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15274 pc_rtx);
15275 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15276 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15277 JUMP_LABEL (insn) = jump_label;
15278
15279 emit_insn (neg_insn (res, res));
15280
15281 emit_label (jump_label);
15282 LABEL_NUSES (jump_label) = 1;
15283
15284 emit_move_insn (op0, res);
15285 }
15286
15287 /* Output code to perform a Newton-Raphson approximation of a single precision
15288 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
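/* Illustrative derivation (exposition only): one Newton-Raphson step for
   f (x) = 1/x - b refines the estimate x0 ~= rcp (b) as

     x1 = x0 * (2.0 - b * x0);        /* == (x0 + x0) - (b * x0 * x0) */

   and a / b is then approximated by a * x1.  The expansion below emits the
   right-hand, mul/add-only form spelled out in the comment inside the
   function.  */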
15289
15290 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15291 {
15292 rtx x0, x1, e0, e1;
15293
15294 x0 = gen_reg_rtx (mode);
15295 e0 = gen_reg_rtx (mode);
15296 e1 = gen_reg_rtx (mode);
15297 x1 = gen_reg_rtx (mode);
15298
15299 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15300
15301 b = force_reg (mode, b);
15302
15303 /* x0 = rcp(b) estimate */
15304 if (mode == V16SFmode || mode == V8DFmode)
15305 {
15306 if (TARGET_AVX512ER)
15307 {
15308 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15309 UNSPEC_RCP28)));
15310 /* res = a * x0 */
15311 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15312 return;
15313 }
15314 else
15315 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15316 UNSPEC_RCP14)));
15317 }
15318 else
15319 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15320 UNSPEC_RCP)));
15321
15322 /* e0 = x0 * b */
15323 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15324
15325 /* e0 = x0 * e0 */
15326 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15327
15328 /* e1 = x0 + x0 */
15329 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15330
15331 /* x1 = e1 - e0 */
15332 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15333
15334 /* res = a * x1 */
15335 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15336 }
15337
15338 /* Output code to perform a Newton-Raphson approximation of a
15339 single precision floating point [reciprocal] square root. */
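/* Illustrative derivation (exposition only): one Newton-Raphson step for
   f (x) = 1/(x*x) - a refines the estimate x0 ~= rsqrt (a) as

     x1 = 0.5 * x0 * (3.0 - a * x0 * x0);   /* == -0.5 * x0 * (a*x0*x0 - 3.0) */

   rsqrt (a) is x1 itself and sqrt (a) is recovered as a * x1; the expansion
   below uses the negated constants -3.0 and -0.5 so that only mul/add forms
   are needed.  */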
15340
15341 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15342 {
15343 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15344 REAL_VALUE_TYPE r;
15345 int unspec;
15346
15347 x0 = gen_reg_rtx (mode);
15348 e0 = gen_reg_rtx (mode);
15349 e1 = gen_reg_rtx (mode);
15350 e2 = gen_reg_rtx (mode);
15351 e3 = gen_reg_rtx (mode);
15352
15353 if (TARGET_AVX512ER && mode == V16SFmode)
15354 {
15355 if (recip)
15356 /* res = rsqrt28(a) estimate */
15357 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15358 UNSPEC_RSQRT28)));
15359 else
15360 {
15361 /* x0 = rsqrt28(a) estimate */
15362 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15363 UNSPEC_RSQRT28)));
15364 /* res = rcp28(x0) estimate */
15365 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15366 UNSPEC_RCP28)));
15367 }
15368 return;
15369 }
15370
15371 real_from_integer (&r, VOIDmode, -3, SIGNED);
15372 mthree = const_double_from_real_value (r, SFmode);
15373
15374 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15375 mhalf = const_double_from_real_value (r, SFmode);
15376 unspec = UNSPEC_RSQRT;
15377
15378 if (VECTOR_MODE_P (mode))
15379 {
15380 mthree = ix86_build_const_vector (mode, true, mthree);
15381 mhalf = ix86_build_const_vector (mode, true, mhalf);
15382 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15383 if (GET_MODE_SIZE (mode) == 64)
15384 unspec = UNSPEC_RSQRT14;
15385 }
15386
15387 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15388 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15389
15390 a = force_reg (mode, a);
15391
15392 /* x0 = rsqrt(a) estimate */
15393 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15394 unspec)));
15395
15396 /* If a == 0.0, mask out the rsqrt estimate (infinite there) to prevent the 0 * inf = NaN when refining sqrt (0.0). */
15397 if (!recip)
15398 {
15399 rtx zero = force_reg (mode, CONST0_RTX(mode));
15400 rtx mask;
15401
15402 /* Handle masked compare. */
15403 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15404 {
15405 mask = gen_reg_rtx (HImode);
15406 /* Imm value 0x4 corresponds to not-equal comparison. */
15407 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15408 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15409 }
15410 else
15411 {
15412 mask = gen_reg_rtx (mode);
15413 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15414 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15415 }
15416 }
15417
15418 /* e0 = x0 * a */
15419 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15420 /* e1 = e0 * x0 */
15421 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15422
15423 /* e2 = e1 - 3. */
15424 mthree = force_reg (mode, mthree);
15425 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15426
15427 mhalf = force_reg (mode, mhalf);
15428 if (recip)
15429 /* e3 = -.5 * x0 */
15430 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15431 else
15432 /* e3 = -.5 * e0 */
15433 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15434 /* ret = e2 * e3 */
15435 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15436 }
15437
15438 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15439 mask for masking out the sign-bit is stored in *SMASK, if that is
15440 non-null. */
15441
15442 static rtx
15443 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15444 {
15445 machine_mode vmode, mode = GET_MODE (op0);
15446 rtx xa, mask;
15447
15448 xa = gen_reg_rtx (mode);
15449 if (mode == SFmode)
15450 vmode = V4SFmode;
15451 else if (mode == DFmode)
15452 vmode = V2DFmode;
15453 else
15454 vmode = mode;
15455 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15456 if (!VECTOR_MODE_P (mode))
15457 {
15458 /* We need to generate a scalar mode mask in this case. */
15459 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15460 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15461 mask = gen_reg_rtx (mode);
15462 emit_insn (gen_rtx_SET (mask, tmp));
15463 }
15464 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15465
15466 if (smask)
15467 *smask = mask;
15468
15469 return xa;
15470 }
15471
15472 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15473 swapping the operands if SWAP_OPERANDS is true. The expanded
15474 code is a forward jump to a newly created label in case the
15475 comparison is true. The generated label rtx is returned. */
15476 static rtx_code_label *
15477 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15478 bool swap_operands)
15479 {
15480 bool unordered_compare = ix86_unordered_fp_compare (code);
15481 rtx_code_label *label;
15482 rtx tmp, reg;
15483
15484 if (swap_operands)
15485 std::swap (op0, op1);
15486
15487 label = gen_label_rtx ();
15488 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15489 if (unordered_compare)
15490 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15491 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15492 emit_insn (gen_rtx_SET (reg, tmp));
15493 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15494 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15495 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15496 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15497 JUMP_LABEL (tmp) = label;
15498
15499 return label;
15500 }
15501
15502 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15503 using comparison code CODE. Operands are swapped for the comparison if
15504 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15505 static rtx
15506 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15507 bool swap_operands)
15508 {
15509 rtx (*insn)(rtx, rtx, rtx, rtx);
15510 machine_mode mode = GET_MODE (op0);
15511 rtx mask = gen_reg_rtx (mode);
15512
15513 if (swap_operands)
15514 std::swap (op0, op1);
15515
15516 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15517
15518 emit_insn (insn (mask, op0, op1,
15519 gen_rtx_fmt_ee (code, mode, op0, op1)));
15520 return mask;
15521 }
15522
15523 /* Expand copysign from SIGN to the positive value ABS_VALUE
15524 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15525 the sign-bit. */
15526
15527 static void
15528 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15529 {
15530 machine_mode mode = GET_MODE (sign);
15531 rtx sgn = gen_reg_rtx (mode);
15532 if (mask == NULL_RTX)
15533 {
15534 machine_mode vmode;
15535
15536 if (mode == SFmode)
15537 vmode = V4SFmode;
15538 else if (mode == DFmode)
15539 vmode = V2DFmode;
15540 else
15541 vmode = mode;
15542
15543 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15544 if (!VECTOR_MODE_P (mode))
15545 {
15546 /* We need to generate a scalar mode mask in this case. */
15547 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15548 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15549 mask = gen_reg_rtx (mode);
15550 emit_insn (gen_rtx_SET (mask, tmp));
15551 }
15552 }
15553 else
15554 mask = gen_rtx_NOT (mode, mask);
15555 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15556 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15557 }
15558
15559 /* Expand SSE sequence for computing lround from OP1 storing
15560 into OP0. */
15561
15562 void
15563 ix86_expand_lround (rtx op0, rtx op1)
15564 {
15565 /* C code for the stuff we're doing below:
15566 tmp = op1 + copysign (nextafter (0.5, 0.0), op1);
15567 return (long)tmp;
15568 */
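/* Note (exposition only): nextafter (0.5, 0.0) is used rather than 0.5
   because, for the largest representable value just below 0.5, adding
   exactly 0.5 would round up to 1.0 in the addition and lround would
   return 1 instead of 0.  E.g. in DFmode:

     double x = nextafter (0.5, 0.0);
     (long) (x + 0.5);                    /* 1, wrong */
     (long) (x + nextafter (0.5, 0.0));   /* 0, the desired lround (x) */

   Inputs whose fractional part is exactly 0.5 still reach the next integer
   after the rounded addition, so halfway cases keep rounding away from
   zero.  */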
15569 machine_mode mode = GET_MODE (op1);
15570 const struct real_format *fmt;
15571 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15572 rtx adj;
15573
15574 /* load nextafter (0.5, 0.0) */
15575 fmt = REAL_MODE_FORMAT (mode);
15576 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15577 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15578
15579 /* adj = copysign (0.5, op1) */
15580 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15581 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15582
15583 /* adj = op1 + adj */
15584 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15585
15586 /* op0 = (imode)adj */
15587 expand_fix (op0, adj, 0);
15588 }
15589
15590 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
15591 into OPERAND0. */
15592
15593 void
15594 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15595 {
15596 /* C code for the stuff we're doing below (for do_floor):
15597 xi = (long)op1;
15598 xi -= (double)xi > op1 ? 1 : 0;
15599 return xi;
15600 */
15601 machine_mode fmode = GET_MODE (op1);
15602 machine_mode imode = GET_MODE (op0);
15603 rtx ireg, freg, tmp;
15604 rtx_code_label *label;
15605
15606 /* reg = (long)op1 */
15607 ireg = gen_reg_rtx (imode);
15608 expand_fix (ireg, op1, 0);
15609
15610 /* freg = (double)reg */
15611 freg = gen_reg_rtx (fmode);
15612 expand_float (freg, ireg, 0);
15613
15614 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15615 label = ix86_expand_sse_compare_and_jump (UNLE,
15616 freg, op1, !do_floor);
15617 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15618 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15619 emit_move_insn (ireg, tmp);
15620
15621 emit_label (label);
15622 LABEL_NUSES (label) = 1;
15623
15624 emit_move_insn (op0, ireg);
15625 }
15626
15627 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15628 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
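/* The callers below rely on the classic property that, for 0 <= x < 2**52
   (2**23 for SFmode), computing x + TWO52 - TWO52 rounds x to an integer:
   once the sum reaches the 2**52 binade the format has no fraction bits
   left, so the addition itself rounds away the fraction (in the current
   rounding mode) and the subtraction restores the magnitude.  Illustrative
   DFmode sketch (exposition only):

     double two52 = 4503599627370496.0;     /* 2**52 */
     double r = (x + two52) - two52;        /* == nearbyint (x) for 0 <= x < 2**52 */
*/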
15629
15630 static rtx
15631 ix86_gen_TWO52 (machine_mode mode)
15632 {
15633 REAL_VALUE_TYPE TWO52r;
15634 rtx TWO52;
15635
15636 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15637 TWO52 = const_double_from_real_value (TWO52r, mode);
15638 TWO52 = force_reg (mode, TWO52);
15639
15640 return TWO52;
15641 }
15642
15643 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
15644
15645 void
15646 ix86_expand_rint (rtx operand0, rtx operand1)
15647 {
15648 /* C code for the stuff we're doing below:
15649 xa = fabs (operand1);
15650 if (!isless (xa, 2**52))
15651 return operand1;
15652 two52 = 2**52;
15653 if (flag_rounding_math)
15654 {
15655 two52 = copysign (two52, operand1);
15656 xa = operand1;
15657 }
15658 xa = xa + two52 - two52;
15659 return copysign (xa, operand1);
15660 */
15661 machine_mode mode = GET_MODE (operand0);
15662 rtx res, xa, TWO52, two52, mask;
15663 rtx_code_label *label;
15664
15665 res = gen_reg_rtx (mode);
15666 emit_move_insn (res, operand1);
15667
15668 /* xa = abs (operand1) */
15669 xa = ix86_expand_sse_fabs (res, &mask);
15670
15671 /* if (!isless (xa, TWO52)) goto label; */
15672 TWO52 = ix86_gen_TWO52 (mode);
15673 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15674
15675 two52 = TWO52;
15676 if (flag_rounding_math)
15677 {
15678 two52 = gen_reg_rtx (mode);
15679 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
15680 xa = res;
15681 }
15682
15683 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
15684 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
15685
15686 ix86_sse_copysign_to_positive (res, xa, res, mask);
15687
15688 emit_label (label);
15689 LABEL_NUSES (label) = 1;
15690
15691 emit_move_insn (operand0, res);
15692 }
15693
15694 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing into
15695 OPERAND0, without relying on the 64bit-only cvttsd2siq DImode truncation. */
15696 void
15697 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15698 {
15699 /* C code for the stuff we expand below.
15700 double xa = fabs (x), x2;
15701 if (!isless (xa, TWO52))
15702 return x;
15703 xa = xa + TWO52 - TWO52;
15704 x2 = copysign (xa, x);
15705 Compensate. Floor:
15706 if (x2 > x)
15707 x2 -= 1;
15708 Compensate. Ceil:
15709 if (x2 < x)
15710 x2 += 1;
15711 if (HONOR_SIGNED_ZEROS (mode))
15712 x2 = copysign (x2, x);
15713 return x2;
15714 */
15715 machine_mode mode = GET_MODE (operand0);
15716 rtx xa, TWO52, tmp, one, res, mask;
15717 rtx_code_label *label;
15718
15719 TWO52 = ix86_gen_TWO52 (mode);
15720
15721 /* Temporary for holding the result, initialized to the input
15722 operand to ease control flow. */
15723 res = gen_reg_rtx (mode);
15724 emit_move_insn (res, operand1);
15725
15726 /* xa = abs (operand1) */
15727 xa = ix86_expand_sse_fabs (res, &mask);
15728
15729 /* if (!isless (xa, TWO52)) goto label; */
15730 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15731
15732 /* xa = xa + TWO52 - TWO52; */
15733 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15734 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15735
15736 /* xa = copysign (xa, operand1) */
15737 ix86_sse_copysign_to_positive (xa, xa, res, mask);
15738
15739 /* generate 1.0 */
15740 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15741
15742 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15743 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15744 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15745 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15746 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15747 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
15748 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15749 emit_move_insn (res, tmp);
15750
15751 emit_label (label);
15752 LABEL_NUSES (label) = 1;
15753
15754 emit_move_insn (operand0, res);
15755 }
15756
15757 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15758 into OPERAND0. */
15759 void
15760 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15761 {
15762 /* C code for the stuff we expand below.
15763 double xa = fabs (x), x2;
15764 if (!isless (xa, TWO52))
15765 return x;
15766 x2 = (double)(long)x;
15767 Compensate. Floor:
15768 if (x2 > x)
15769 x2 -= 1;
15770 Compensate. Ceil:
15771 if (x2 < x)
15772 x2 += 1;
15773 if (HONOR_SIGNED_ZEROS (mode))
15774 return copysign (x2, x);
15775 return x2;
15776 */
15777 machine_mode mode = GET_MODE (operand0);
15778 rtx xa, xi, TWO52, tmp, one, res, mask;
15779 rtx_code_label *label;
15780
15781 TWO52 = ix86_gen_TWO52 (mode);
15782
15783 /* Temporary for holding the result, initialized to the input
15784 operand to ease control flow. */
15785 res = gen_reg_rtx (mode);
15786 emit_move_insn (res, operand1);
15787
15788 /* xa = abs (operand1) */
15789 xa = ix86_expand_sse_fabs (res, &mask);
15790
15791 /* if (!isless (xa, TWO52)) goto label; */
15792 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15793
15794 /* xa = (double)(long)x */
15795 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15796 expand_fix (xi, res, 0);
15797 expand_float (xa, xi, 0);
15798
15799 /* generate 1.0 */
15800 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15801
15802 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15803 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15804 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15805 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15806 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15807 emit_move_insn (res, tmp);
15808
15809 if (HONOR_SIGNED_ZEROS (mode))
15810 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15811
15812 emit_label (label);
15813 LABEL_NUSES (label) = 1;
15814
15815 emit_move_insn (operand0, res);
15816 }
15817
15818 /* Expand SSE sequence for computing round from OPERAND1 storing
15819 into OPERAND0. Sequence that works without relying on DImode truncation
15820 via cvttsd2siq that is only available on 64bit targets. */
15821 void
15822 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
15823 {
15824 /* C code for the stuff we expand below.
15825 double xa = fabs (x), xa2, x2;
15826 if (!isless (xa, TWO52))
15827 return x;
15828 Using the absolute value and copying back sign makes
15829 -0.0 -> -0.0 correct.
15830 xa2 = xa + TWO52 - TWO52;
15831 Compensate.
15832 dxa = xa2 - xa;
15833 if (dxa <= -0.5)
15834 xa2 += 1;
15835 else if (dxa > 0.5)
15836 xa2 -= 1;
15837 x2 = copysign (xa2, x);
15838 return x2;
15839 */
15840 machine_mode mode = GET_MODE (operand0);
15841 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
15842 rtx_code_label *label;
15843
15844 TWO52 = ix86_gen_TWO52 (mode);
15845
15846 /* Temporary for holding the result, initialized to the input
15847 operand to ease control flow. */
15848 res = gen_reg_rtx (mode);
15849 emit_move_insn (res, operand1);
15850
15851 /* xa = abs (operand1) */
15852 xa = ix86_expand_sse_fabs (res, &mask);
15853
15854 /* if (!isless (xa, TWO52)) goto label; */
15855 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15856
15857 /* xa2 = xa + TWO52 - TWO52; */
15858 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15859 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
15860
15861 /* dxa = xa2 - xa; */
15862 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
15863
15864 /* generate 0.5, 1.0 and -0.5 */
15865 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
15866 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
15867 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
15868 0, OPTAB_DIRECT);
15869
15870 /* Compensate. */
15871 tmp = gen_reg_rtx (mode);
15872 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
15873 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
15874 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15875 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15876 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
15877 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
15878 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15879 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15880
15881 /* res = copysign (xa2, operand1) */
15882 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
15883
15884 emit_label (label);
15885 LABEL_NUSES (label) = 1;
15886
15887 emit_move_insn (operand0, res);
15888 }
15889
15890 /* Expand SSE sequence for computing trunc from OPERAND1 storing
15891 into OPERAND0. */
15892 void
15893 ix86_expand_trunc (rtx operand0, rtx operand1)
15894 {
15895 /* C code for SSE variant we expand below.
15896 double xa = fabs (x), x2;
15897 if (!isless (xa, TWO52))
15898 return x;
15899 x2 = (double)(long)x;
15900 if (HONOR_SIGNED_ZEROS (mode))
15901 return copysign (x2, x);
15902 return x2;
15903 */
15904 machine_mode mode = GET_MODE (operand0);
15905 rtx xa, xi, TWO52, res, mask;
15906 rtx_code_label *label;
15907
15908 TWO52 = ix86_gen_TWO52 (mode);
15909
15910 /* Temporary for holding the result, initialized to the input
15911 operand to ease control flow. */
15912 res = gen_reg_rtx (mode);
15913 emit_move_insn (res, operand1);
15914
15915 /* xa = abs (operand1) */
15916 xa = ix86_expand_sse_fabs (res, &mask);
15917
15918 /* if (!isless (xa, TWO52)) goto label; */
15919 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15920
15921 /* x = (double)(long)x */
15922 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15923 expand_fix (xi, res, 0);
15924 expand_float (res, xi, 0);
15925
15926 if (HONOR_SIGNED_ZEROS (mode))
15927 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15928
15929 emit_label (label);
15930 LABEL_NUSES (label) = 1;
15931
15932 emit_move_insn (operand0, res);
15933 }
15934
15935 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
15936 OPERAND0, without relying on the 64bit-only cvttsd2siq DImode truncation. */
15937 void
15938 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
15939 {
15940 machine_mode mode = GET_MODE (operand0);
15941 rtx xa, mask, TWO52, one, res, smask, tmp;
15942 rtx_code_label *label;
15943
15944 /* C code for SSE variant we expand below.
15945 double xa = fabs (x), x2;
15946 if (!isless (xa, TWO52))
15947 return x;
15948 xa2 = xa + TWO52 - TWO52;
15949 Compensate:
15950 if (xa2 > xa)
15951 xa2 -= 1.0;
15952 x2 = copysign (xa2, x);
15953 return x2;
15954 */
15955
15956 TWO52 = ix86_gen_TWO52 (mode);
15957
15958 /* Temporary for holding the result, initialized to the input
15959 operand to ease control flow. */
15960 res = gen_reg_rtx (mode);
15961 emit_move_insn (res, operand1);
15962
15963 /* xa = abs (operand1) */
15964 xa = ix86_expand_sse_fabs (res, &smask);
15965
15966 /* if (!isless (xa, TWO52)) goto label; */
15967 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15968
15969 /* res = xa + TWO52 - TWO52; */
15970 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15971 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
15972 emit_move_insn (res, tmp);
15973
15974 /* generate 1.0 */
15975 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15976
15977 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
15978 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
15979 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
15980 tmp = expand_simple_binop (mode, MINUS,
15981 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
15982 emit_move_insn (res, tmp);
15983
15984 /* res = copysign (res, operand1) */
15985 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
15986
15987 emit_label (label);
15988 LABEL_NUSES (label) = 1;
15989
15990 emit_move_insn (operand0, res);
15991 }
15992
15993 /* Expand SSE sequence for computing round from OPERAND1 storing
15994 into OPERAND0. */
15995 void
15996 ix86_expand_round (rtx operand0, rtx operand1)
15997 {
15998 /* C code for the stuff we're doing below:
15999 double xa = fabs (x);
16000 if (!isless (xa, TWO52))
16001 return x;
16002 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16003 return copysign (xa, x);
16004 */
16005 machine_mode mode = GET_MODE (operand0);
16006 rtx res, TWO52, xa, xi, half, mask;
16007 rtx_code_label *label;
16008 const struct real_format *fmt;
16009 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16010
16011 /* Temporary for holding the result, initialized to the input
16012 operand to ease control flow. */
16013 res = gen_reg_rtx (mode);
16014 emit_move_insn (res, operand1);
16015
16016 TWO52 = ix86_gen_TWO52 (mode);
16017 xa = ix86_expand_sse_fabs (res, &mask);
16018 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16019
16020 /* load nextafter (0.5, 0.0) */
16021 fmt = REAL_MODE_FORMAT (mode);
16022 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16023 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16024
16025 /* xa = xa + 0.5 */
16026 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16027 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16028
16029 /* xa = (double)(int64_t)xa */
16030 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16031 expand_fix (xi, xa, 0);
16032 expand_float (xa, xi, 0);
16033
16034 /* res = copysign (xa, operand1) */
16035 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16036
16037 emit_label (label);
16038 LABEL_NUSES (label) = 1;
16039
16040 emit_move_insn (operand0, res);
16041 }
16042
16043 /* Expand SSE sequence for computing round
16044 from OP1 storing into OP0 using sse4 round insn. */
16045 void
16046 ix86_expand_round_sse4 (rtx op0, rtx op1)
16047 {
16048 machine_mode mode = GET_MODE (op0);
16049 rtx e1, e2, res, half;
16050 const struct real_format *fmt;
16051 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16052 rtx (*gen_copysign) (rtx, rtx, rtx);
16053 rtx (*gen_round) (rtx, rtx, rtx);
16054
16055 switch (mode)
16056 {
16057 case E_SFmode:
16058 gen_copysign = gen_copysignsf3;
16059 gen_round = gen_sse4_1_roundsf2;
16060 break;
16061 case E_DFmode:
16062 gen_copysign = gen_copysigndf3;
16063 gen_round = gen_sse4_1_rounddf2;
16064 break;
16065 default:
16066 gcc_unreachable ();
16067 }
16068
16069 /* round (a) = trunc (a + copysign (0.5, a)) */
16070
16071 /* load nextafter (0.5, 0.0) */
16072 fmt = REAL_MODE_FORMAT (mode);
16073 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16074 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16075 half = const_double_from_real_value (pred_half, mode);
16076
16077 /* e1 = copysign (0.5, op1) */
16078 e1 = gen_reg_rtx (mode);
16079 emit_insn (gen_copysign (e1, half, op1));
16080
16081 /* e2 = op1 + e1 */
16082 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16083
16084 /* res = trunc (e2) */
16085 res = gen_reg_rtx (mode);
16086 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16087
16088 emit_move_insn (op0, res);
16089 }
16090
16091 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16092 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16093 insn every time. */
16094
16095 static GTY(()) rtx_insn *vselect_insn;
16096
16097 /* Initialize vselect_insn. */
16098
16099 static void
16100 init_vselect_insn (void)
16101 {
16102 unsigned i;
16103 rtx x;
16104
16105 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16106 for (i = 0; i < MAX_VECT_LEN; ++i)
16107 XVECEXP (x, 0, i) = const0_rtx;
16108 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16109 const0_rtx), x);
16110 x = gen_rtx_SET (const0_rtx, x);
16111 start_sequence ();
16112 vselect_insn = emit_insn (x);
16113 end_sequence ();
16114 }
16115
16116 /* Construct (set target (vec_select op0 (parallel perm))) and
16117 return true if that's a valid instruction in the active ISA. */
16118
16119 static bool
16120 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16121 unsigned nelt, bool testing_p)
16122 {
16123 unsigned int i;
16124 rtx x, save_vconcat;
16125 int icode;
16126
16127 if (vselect_insn == NULL_RTX)
16128 init_vselect_insn ();
16129
16130 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16131 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16132 for (i = 0; i < nelt; ++i)
16133 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16134 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16135 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16136 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16137 SET_DEST (PATTERN (vselect_insn)) = target;
16138 icode = recog_memoized (vselect_insn);
16139
16140 if (icode >= 0 && !testing_p)
16141 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16142
16143 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16144 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16145 INSN_CODE (vselect_insn) = -1;
16146
16147 return icode >= 0;
16148 }
16149
16150 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16151
16152 static bool
16153 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16154 const unsigned char *perm, unsigned nelt,
16155 bool testing_p)
16156 {
16157 machine_mode v2mode;
16158 rtx x;
16159 bool ok;
16160
16161 if (vselect_insn == NULL_RTX)
16162 init_vselect_insn ();
16163
16164 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16165 return false;
16166 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16167 PUT_MODE (x, v2mode);
16168 XEXP (x, 0) = op0;
16169 XEXP (x, 1) = op1;
16170 ok = expand_vselect (target, x, perm, nelt, testing_p);
16171 XEXP (x, 0) = const0_rtx;
16172 XEXP (x, 1) = const0_rtx;
16173 return ok;
16174 }
16175
16176 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16177 using movss or movsd. */
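/* Illustrative example (exposition only): in V4SFmode the permutation
   {4, 1, 2, 3} takes element 0 from op1 and elements 1..3 from op0, which
   is exactly what the register form of movss does; {2, 1} in V2DFmode
   likewise maps to movsd.  Anything that changes more than the first
   element is rejected below.  */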
16178 static bool
16179 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16180 {
16181 machine_mode vmode = d->vmode;
16182 unsigned i, nelt = d->nelt;
16183 rtx x;
16184
16185 if (d->one_operand_p)
16186 return false;
16187
16188 if (!(TARGET_SSE && vmode == V4SFmode)
16189 && !(TARGET_SSE2 && vmode == V2DFmode))
16190 return false;
16191
16192 /* Only the first element is changed. */
16193 if (d->perm[0] != nelt && d->perm[0] != 0)
16194 return false;
16195 for (i = 1; i < nelt; ++i)
16196 if (d->perm[i] != i + nelt - d->perm[0])
16197 return false;
16198
16199 if (d->testing_p)
16200 return true;
16201
16202 if (d->perm[0] == nelt)
16203 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16204 else
16205 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16206
16207 emit_insn (gen_rtx_SET (d->target, x));
16208
16209 return true;
16210 }
16211
16212 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16213 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
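/* Illustrative example (exposition only): in V8HImode the permutation
   {0, 9, 2, 11, 4, 13, 6, 15} keeps every element in its own position and
   takes the odd positions from op1, so it becomes pblendw with the mask
   0xaa computed below (bit i set means element i comes from op1); any
   permutation that moves an element to a different position is rejected.  */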
16214
16215 static bool
16216 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16217 {
16218 machine_mode mmode, vmode = d->vmode;
16219 unsigned i, mask, nelt = d->nelt;
16220 rtx target, op0, op1, maskop, x;
16221 rtx rperm[32], vperm;
16222
16223 if (d->one_operand_p)
16224 return false;
16225 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16226 && (TARGET_AVX512BW
16227 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16228 ;
16229 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16230 ;
16231 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16232 ;
16233 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16234 ;
16235 else
16236 return false;
16237
16238 /* This is a blend, not a permute. Elements must stay in their
16239 respective lanes. */
16240 for (i = 0; i < nelt; ++i)
16241 {
16242 unsigned e = d->perm[i];
16243 if (!(e == i || e == i + nelt))
16244 return false;
16245 }
16246
16247 if (d->testing_p)
16248 return true;
16249
16250 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16251 decision should be extracted elsewhere, so that we only try that
16252 sequence once all budget==3 options have been tried. */
16253 target = d->target;
16254 op0 = d->op0;
16255 op1 = d->op1;
16256 mask = 0;
16257
16258 switch (vmode)
16259 {
16260 case E_V8DFmode:
16261 case E_V16SFmode:
16262 case E_V4DFmode:
16263 case E_V8SFmode:
16264 case E_V2DFmode:
16265 case E_V4SFmode:
16266 case E_V8HImode:
16267 case E_V8SImode:
16268 case E_V32HImode:
16269 case E_V64QImode:
16270 case E_V16SImode:
16271 case E_V8DImode:
16272 for (i = 0; i < nelt; ++i)
16273 mask |= (d->perm[i] >= nelt) << i;
16274 break;
16275
16276 case E_V2DImode:
16277 for (i = 0; i < 2; ++i)
16278 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16279 vmode = V8HImode;
16280 goto do_subreg;
16281
16282 case E_V4SImode:
16283 for (i = 0; i < 4; ++i)
16284 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16285 vmode = V8HImode;
16286 goto do_subreg;
16287
16288 case E_V16QImode:
16289 /* See if bytes move in pairs so we can use pblendw with
16290 an immediate argument, rather than pblendvb with a vector
16291 argument. */
16292 for (i = 0; i < 16; i += 2)
16293 if (d->perm[i] + 1 != d->perm[i + 1])
16294 {
16295 use_pblendvb:
16296 for (i = 0; i < nelt; ++i)
16297 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16298
16299 finish_pblendvb:
16300 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16301 vperm = force_reg (vmode, vperm);
16302
16303 if (GET_MODE_SIZE (vmode) == 16)
16304 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16305 else
16306 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16307 if (target != d->target)
16308 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16309 return true;
16310 }
16311
16312 for (i = 0; i < 8; ++i)
16313 mask |= (d->perm[i * 2] >= 16) << i;
16314 vmode = V8HImode;
16315 /* FALLTHRU */
16316
16317 do_subreg:
16318 target = gen_reg_rtx (vmode);
16319 op0 = gen_lowpart (vmode, op0);
16320 op1 = gen_lowpart (vmode, op1);
16321 break;
16322
16323 case E_V32QImode:
16324 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16325 for (i = 0; i < 32; i += 2)
16326 if (d->perm[i] + 1 != d->perm[i + 1])
16327 goto use_pblendvb;
16328 /* See if bytes move in quadruplets. If yes, vpblendd
16329 with immediate can be used. */
16330 for (i = 0; i < 32; i += 4)
16331 if (d->perm[i] + 2 != d->perm[i + 2])
16332 break;
16333 if (i < 32)
16334 {
16335 /* See if bytes move the same in both lanes. If yes,
16336 vpblendw with immediate can be used. */
16337 for (i = 0; i < 16; i += 2)
16338 if (d->perm[i] + 16 != d->perm[i + 16])
16339 goto use_pblendvb;
16340
16341 /* Use vpblendw. */
16342 for (i = 0; i < 16; ++i)
16343 mask |= (d->perm[i * 2] >= 32) << i;
16344 vmode = V16HImode;
16345 goto do_subreg;
16346 }
16347
16348 /* Use vpblendd. */
16349 for (i = 0; i < 8; ++i)
16350 mask |= (d->perm[i * 4] >= 32) << i;
16351 vmode = V8SImode;
16352 goto do_subreg;
16353
16354 case E_V16HImode:
16355 /* See if words move in pairs. If yes, vpblendd can be used. */
16356 for (i = 0; i < 16; i += 2)
16357 if (d->perm[i] + 1 != d->perm[i + 1])
16358 break;
16359 if (i < 16)
16360 {
16361 /* See if words move the same in both lanes. If not,
16362 vpblendvb must be used. */
16363 for (i = 0; i < 8; i++)
16364 if (d->perm[i] + 8 != d->perm[i + 8])
16365 {
16366 /* Use vpblendvb. */
16367 for (i = 0; i < 32; ++i)
16368 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16369
16370 vmode = V32QImode;
16371 nelt = 32;
16372 target = gen_reg_rtx (vmode);
16373 op0 = gen_lowpart (vmode, op0);
16374 op1 = gen_lowpart (vmode, op1);
16375 goto finish_pblendvb;
16376 }
16377
16378 /* Use vpblendw. */
16379 for (i = 0; i < 16; ++i)
16380 mask |= (d->perm[i] >= 16) << i;
16381 break;
16382 }
16383
16384 /* Use vpblendd. */
16385 for (i = 0; i < 8; ++i)
16386 mask |= (d->perm[i * 2] >= 16) << i;
16387 vmode = V8SImode;
16388 goto do_subreg;
16389
16390 case E_V4DImode:
16391 /* Use vpblendd. */
16392 for (i = 0; i < 4; ++i)
16393 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16394 vmode = V8SImode;
16395 goto do_subreg;
16396
16397 default:
16398 gcc_unreachable ();
16399 }
16400
16401 switch (vmode)
16402 {
16403 case E_V8DFmode:
16404 case E_V8DImode:
16405 mmode = QImode;
16406 break;
16407 case E_V16SFmode:
16408 case E_V16SImode:
16409 mmode = HImode;
16410 break;
16411 case E_V32HImode:
16412 mmode = SImode;
16413 break;
16414 case E_V64QImode:
16415 mmode = DImode;
16416 break;
16417 default:
16418 mmode = VOIDmode;
16419 }
16420
16421 if (mmode != VOIDmode)
16422 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16423 else
16424 maskop = GEN_INT (mask);
16425
16426 /* This matches five different patterns with the different modes. */
16427 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16428 x = gen_rtx_SET (target, x);
16429 emit_insn (x);
16430 if (target != d->target)
16431 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16432
16433 return true;
16434 }
16435
16436 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16437 in terms of the variable form of vpermilps.
16438
16439 Note that we will have already failed the immediate input vpermilps,
16440 which requires that the high and low part shuffle be identical; the
16441 variable form doesn't require that. */
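/* Illustrative example (exposition only): in V8SFmode the permutation
   {1, 0, 3, 2, 6, 6, 4, 4} shuffles the two 128-bit lanes differently, so
   the immediate form (one 4-element pattern applied to both lanes) cannot
   handle it, but the variable-mask vpermilps emitted here can, since each
   lane gets its own four indices.  */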
16442
16443 static bool
16444 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16445 {
16446 rtx rperm[8], vperm;
16447 unsigned i;
16448
16449 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16450 return false;
16451
16452 /* We can only permute within the 128-bit lane. */
16453 for (i = 0; i < 8; ++i)
16454 {
16455 unsigned e = d->perm[i];
16456 if (i < 4 ? e >= 4 : e < 4)
16457 return false;
16458 }
16459
16460 if (d->testing_p)
16461 return true;
16462
16463 for (i = 0; i < 8; ++i)
16464 {
16465 unsigned e = d->perm[i];
16466
16467 /* Within each 128-bit lane, the elements of op0 are numbered
16468 from 0 and the elements of op1 are numbered from 4. */
16469 if (e >= 8 + 4)
16470 e -= 8;
16471 else if (e >= 4)
16472 e -= 4;
16473
16474 rperm[i] = GEN_INT (e);
16475 }
16476
16477 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16478 vperm = force_reg (V8SImode, vperm);
16479 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16480
16481 return true;
16482 }
16483
16484 /* Return true if permutation D can be performed as VMODE permutation
16485 instead. */
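/* Illustrative example (exposition only): a V16QImode permutation such as
   {2, 3, 0, 1, 6, 7, 4, 5, ...}, where bytes always move in aligned pairs,
   is also expressible as the V8HImode permutation {1, 0, 3, 2, ...}; the
   chunk test below checks exactly that each group starts on a chunk
   boundary and stays contiguous.  */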
16486
16487 static bool
16488 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16489 {
16490 unsigned int i, j, chunk;
16491
16492 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16493 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16494 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16495 return false;
16496
16497 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16498 return true;
16499
16500 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16501 for (i = 0; i < d->nelt; i += chunk)
16502 if (d->perm[i] & (chunk - 1))
16503 return false;
16504 else
16505 for (j = 1; j < chunk; ++j)
16506 if (d->perm[i] + j != d->perm[i + j])
16507 return false;
16508
16509 return true;
16510 }
16511
16512 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16513 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16514
16515 static bool
16516 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16517 {
16518 unsigned i, nelt, eltsz, mask;
16519 unsigned char perm[64];
16520 machine_mode vmode = V16QImode;
16521 rtx rperm[64], vperm, target, op0, op1;
16522
16523 nelt = d->nelt;
16524
16525 if (!d->one_operand_p)
16526 {
16527 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16528 {
16529 if (TARGET_AVX2
16530 && valid_perm_using_mode_p (V2TImode, d))
16531 {
16532 if (d->testing_p)
16533 return true;
16534
16535 /* Use vperm2i128 insn. The pattern uses
16536 V4DImode instead of V2TImode. */
16537 target = d->target;
16538 if (d->vmode != V4DImode)
16539 target = gen_reg_rtx (V4DImode);
16540 op0 = gen_lowpart (V4DImode, d->op0);
16541 op1 = gen_lowpart (V4DImode, d->op1);
16542 rperm[0]
16543 = GEN_INT ((d->perm[0] / (nelt / 2))
16544 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16545 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16546 if (target != d->target)
16547 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16548 return true;
16549 }
16550 return false;
16551 }
16552 }
16553 else
16554 {
16555 if (GET_MODE_SIZE (d->vmode) == 16)
16556 {
16557 if (!TARGET_SSSE3)
16558 return false;
16559 }
16560 else if (GET_MODE_SIZE (d->vmode) == 32)
16561 {
16562 if (!TARGET_AVX2)
16563 return false;
16564
16565 /* V4DImode should already be handled through
16566 expand_vselect by the vpermq instruction. */
16567 gcc_assert (d->vmode != V4DImode);
16568
16569 vmode = V32QImode;
16570 if (d->vmode == V8SImode
16571 || d->vmode == V16HImode
16572 || d->vmode == V32QImode)
16573 {
16574 /* First see if vpermq can be used for
16575 V8SImode/V16HImode/V32QImode. */
16576 if (valid_perm_using_mode_p (V4DImode, d))
16577 {
16578 for (i = 0; i < 4; i++)
16579 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16580 if (d->testing_p)
16581 return true;
16582 target = gen_reg_rtx (V4DImode);
16583 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16584 perm, 4, false))
16585 {
16586 emit_move_insn (d->target,
16587 gen_lowpart (d->vmode, target));
16588 return true;
16589 }
16590 return false;
16591 }
16592
16593 /* Next see if vpermd can be used. */
16594 if (valid_perm_using_mode_p (V8SImode, d))
16595 vmode = V8SImode;
16596 }
16597 /* Or if vpermps can be used. */
16598 else if (d->vmode == V8SFmode)
16599 vmode = V8SImode;
16600
16601 if (vmode == V32QImode)
16602 {
16603 /* vpshufb only works within 128-bit lanes; it is not
16604 possible to shuffle bytes across the lanes. */
16605 for (i = 0; i < nelt; ++i)
16606 if ((d->perm[i] ^ i) & (nelt / 2))
16607 return false;
16608 }
16609 }
16610 else if (GET_MODE_SIZE (d->vmode) == 64)
16611 {
16612 if (!TARGET_AVX512BW)
16613 return false;
16614
16615 /* If vpermq didn't work, vpshufb won't work either. */
16616 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16617 return false;
16618
16619 vmode = V64QImode;
16620 if (d->vmode == V16SImode
16621 || d->vmode == V32HImode
16622 || d->vmode == V64QImode)
16623 {
16624 /* First see if vpermq can be used for
16625 V16SImode/V32HImode/V64QImode. */
16626 if (valid_perm_using_mode_p (V8DImode, d))
16627 {
16628 for (i = 0; i < 8; i++)
16629 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16630 if (d->testing_p)
16631 return true;
16632 target = gen_reg_rtx (V8DImode);
16633 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16634 perm, 8, false))
16635 {
16636 emit_move_insn (d->target,
16637 gen_lowpart (d->vmode, target));
16638 return true;
16639 }
16640 return false;
16641 }
16642
16643 /* Next see if vpermd can be used. */
16644 if (valid_perm_using_mode_p (V16SImode, d))
16645 vmode = V16SImode;
16646 }
16647 /* Or if vpermps can be used. */
16648 else if (d->vmode == V16SFmode)
16649 vmode = V16SImode;
16650 if (vmode == V64QImode)
16651 {
16652 /* vpshufb only works within 128-bit lanes; it is not
16653 possible to shuffle bytes across the lanes. */
16654 for (i = 0; i < nelt; ++i)
16655 if ((d->perm[i] ^ i) & (nelt / 4))
16656 return false;
16657 }
16658 }
16659 else
16660 return false;
16661 }
16662
16663 if (d->testing_p)
16664 return true;
16665
16666 if (vmode == V8SImode)
16667 for (i = 0; i < 8; ++i)
16668 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16669 else if (vmode == V16SImode)
16670 for (i = 0; i < 16; ++i)
16671 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16672 else
16673 {
16674 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16675 if (!d->one_operand_p)
16676 mask = 2 * nelt - 1;
16677 else if (vmode == V16QImode)
16678 mask = nelt - 1;
16679 else if (vmode == V64QImode)
16680 mask = nelt / 4 - 1;
16681 else
16682 mask = nelt / 2 - 1;
16683
16684 for (i = 0; i < nelt; ++i)
16685 {
16686 unsigned j, e = d->perm[i] & mask;
16687 for (j = 0; j < eltsz; ++j)
16688 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16689 }
16690 }
16691
16692 vperm = gen_rtx_CONST_VECTOR (vmode,
16693 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16694 vperm = force_reg (vmode, vperm);
16695
16696 target = d->target;
16697 if (d->vmode != vmode)
16698 target = gen_reg_rtx (vmode);
16699 op0 = gen_lowpart (vmode, d->op0);
16700 if (d->one_operand_p)
16701 {
16702 if (vmode == V16QImode)
16703 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16704 else if (vmode == V32QImode)
16705 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16706 else if (vmode == V64QImode)
16707 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16708 else if (vmode == V8SFmode)
16709 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16710 else if (vmode == V8SImode)
16711 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16712 else if (vmode == V16SFmode)
16713 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16714 else if (vmode == V16SImode)
16715 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16716 else
16717 gcc_unreachable ();
16718 }
16719 else
16720 {
16721 op1 = gen_lowpart (vmode, d->op1);
16722 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16723 }
16724 if (target != d->target)
16725 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16726
16727 return true;
16728 }
16729
16730 /* For V*[QHS]Imode permutations, check whether the same permutation
16731 can be performed in a 2x, 4x or 8x wider inner mode instead. */
16732
16733 static bool
16734 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16735 struct expand_vec_perm_d *nd)
16736 {
16737 int i;
16738 machine_mode mode = VOIDmode;
16739
16740 switch (d->vmode)
16741 {
16742 case E_V16QImode: mode = V8HImode; break;
16743 case E_V32QImode: mode = V16HImode; break;
16744 case E_V64QImode: mode = V32HImode; break;
16745 case E_V8HImode: mode = V4SImode; break;
16746 case E_V16HImode: mode = V8SImode; break;
16747 case E_V32HImode: mode = V16SImode; break;
16748 case E_V4SImode: mode = V2DImode; break;
16749 case E_V8SImode: mode = V4DImode; break;
16750 case E_V16SImode: mode = V8DImode; break;
16751 default: return false;
16752 }
16753 for (i = 0; i < d->nelt; i += 2)
16754 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16755 return false;
16756 nd->vmode = mode;
16757 nd->nelt = d->nelt / 2;
16758 for (i = 0; i < nd->nelt; i++)
16759 nd->perm[i] = d->perm[2 * i] / 2;
16760 if (GET_MODE_INNER (mode) != DImode)
16761 canonicalize_vector_int_perm (nd, nd);
16762 if (nd != d)
16763 {
16764 nd->one_operand_p = d->one_operand_p;
16765 nd->testing_p = d->testing_p;
16766 if (d->op0 == d->op1)
16767 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16768 else
16769 {
16770 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16771 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16772 }
16773 if (d->testing_p)
16774 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16775 else
16776 nd->target = gen_reg_rtx (nd->vmode);
16777 }
16778 return true;
16779 }
16780
16781 /* Try to expand one-operand permutation with constant mask. */
16782
16783 static bool
16784 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
16785 {
16786 machine_mode mode = GET_MODE (d->op0);
16787 machine_mode maskmode = mode;
16788 rtx (*gen) (rtx, rtx, rtx) = NULL;
16789 rtx target, op0, mask;
16790 rtx vec[64];
16791
16792 if (!rtx_equal_p (d->op0, d->op1))
16793 return false;
16794
16795 if (!TARGET_AVX512F)
16796 return false;
16797
16798 switch (mode)
16799 {
16800 case E_V16SImode:
16801 gen = gen_avx512f_permvarv16si;
16802 break;
16803 case E_V16SFmode:
16804 gen = gen_avx512f_permvarv16sf;
16805 maskmode = V16SImode;
16806 break;
16807 case E_V8DImode:
16808 gen = gen_avx512f_permvarv8di;
16809 break;
16810 case E_V8DFmode:
16811 gen = gen_avx512f_permvarv8df;
16812 maskmode = V8DImode;
16813 break;
16814 default:
16815 return false;
16816 }
16817
16818 target = d->target;
16819 op0 = d->op0;
16820 for (int i = 0; i < d->nelt; ++i)
16821 vec[i] = GEN_INT (d->perm[i]);
16822 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
16823 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
16824 return true;
16825 }
16826
16827 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
16828
16829 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
16830 in a single instruction. */
16831
16832 static bool
16833 expand_vec_perm_1 (struct expand_vec_perm_d *d)
16834 {
16835 unsigned i, nelt = d->nelt;
16836 struct expand_vec_perm_d nd;
16837
16838 /* Check plain VEC_SELECT first, because AVX has instructions that could
16839 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
16840 input where SEL+CONCAT may not. */
16841 if (d->one_operand_p)
16842 {
16843 int mask = nelt - 1;
16844 bool identity_perm = true;
16845 bool broadcast_perm = true;
16846
16847 for (i = 0; i < nelt; i++)
16848 {
16849 nd.perm[i] = d->perm[i] & mask;
16850 if (nd.perm[i] != i)
16851 identity_perm = false;
16852 if (nd.perm[i])
16853 broadcast_perm = false;
16854 }
16855
16856 if (identity_perm)
16857 {
16858 if (!d->testing_p)
16859 emit_move_insn (d->target, d->op0);
16860 return true;
16861 }
16862 else if (broadcast_perm && TARGET_AVX2)
16863 {
16864 /* Use vpbroadcast{b,w,d}. */
16865 rtx (*gen) (rtx, rtx) = NULL;
16866 switch (d->vmode)
16867 {
16868 case E_V64QImode:
16869 if (TARGET_AVX512BW)
16870 gen = gen_avx512bw_vec_dupv64qi_1;
16871 break;
16872 case E_V32QImode:
16873 gen = gen_avx2_pbroadcastv32qi_1;
16874 break;
16875 case E_V32HImode:
16876 if (TARGET_AVX512BW)
16877 gen = gen_avx512bw_vec_dupv32hi_1;
16878 break;
16879 case E_V16HImode:
16880 gen = gen_avx2_pbroadcastv16hi_1;
16881 break;
16882 case E_V16SImode:
16883 if (TARGET_AVX512F)
16884 gen = gen_avx512f_vec_dupv16si_1;
16885 break;
16886 case E_V8SImode:
16887 gen = gen_avx2_pbroadcastv8si_1;
16888 break;
16889 case E_V16QImode:
16890 gen = gen_avx2_pbroadcastv16qi;
16891 break;
16892 case E_V8HImode:
16893 gen = gen_avx2_pbroadcastv8hi;
16894 break;
16895 case E_V16SFmode:
16896 if (TARGET_AVX512F)
16897 gen = gen_avx512f_vec_dupv16sf_1;
16898 break;
16899 case E_V8SFmode:
16900 gen = gen_avx2_vec_dupv8sf_1;
16901 break;
16902 case E_V8DFmode:
16903 if (TARGET_AVX512F)
16904 gen = gen_avx512f_vec_dupv8df_1;
16905 break;
16906 case E_V8DImode:
16907 if (TARGET_AVX512F)
16908 gen = gen_avx512f_vec_dupv8di_1;
16909 break;
16910 /* For other modes prefer other shuffles this function creates. */
16911 default: break;
16912 }
16913 if (gen != NULL)
16914 {
16915 if (!d->testing_p)
16916 emit_insn (gen (d->target, d->op0));
16917 return true;
16918 }
16919 }
16920
16921 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
16922 return true;
16923
16924 /* There are plenty of patterns in sse.md that are written for
16925 SEL+CONCAT and are not replicated for a single op. Perhaps
16926 that should be changed, to avoid the nastiness here. */
16927
16928 /* Recognize interleave style patterns, which means incrementing
16929 every other permutation operand. */
16930 for (i = 0; i < nelt; i += 2)
16931 {
16932 nd.perm[i] = d->perm[i] & mask;
16933 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
16934 }
16935 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
16936 d->testing_p))
16937 return true;
16938
16939 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
16940 if (nelt >= 4)
16941 {
16942 for (i = 0; i < nelt; i += 4)
16943 {
16944 nd.perm[i + 0] = d->perm[i + 0] & mask;
16945 nd.perm[i + 1] = d->perm[i + 1] & mask;
16946 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
16947 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
16948 }
16949
16950 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
16951 d->testing_p))
16952 return true;
16953 }
16954 }
16955
16956 /* Try movss/movsd instructions. */
16957 if (expand_vec_perm_movs (d))
16958 return true;
16959
16960 /* Finally, try the fully general two operand permute. */
16961 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
16962 d->testing_p))
16963 return true;
16964
16965 /* Recognize interleave style patterns with reversed operands. */
16966 if (!d->one_operand_p)
16967 {
16968 for (i = 0; i < nelt; ++i)
16969 {
16970 unsigned e = d->perm[i];
16971 if (e >= nelt)
16972 e -= nelt;
16973 else
16974 e += nelt;
16975 nd.perm[i] = e;
16976 }
16977
16978 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
16979 d->testing_p))
16980 return true;
16981 }
16982
16983 /* Try the SSE4.1 blend variable merge instructions. */
16984 if (expand_vec_perm_blend (d))
16985 return true;
16986
16987 /* Try one of the AVX vpermil variable permutations. */
16988 if (expand_vec_perm_vpermil (d))
16989 return true;
16990
16991 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
16992 vpshufb, vpermd, vpermps or vpermq variable permutation. */
16993 if (expand_vec_perm_pshufb (d))
16994 return true;
16995
16996 /* Try the AVX2 vpalignr instruction. */
16997 if (expand_vec_perm_palignr (d, true))
16998 return true;
16999
17000 /* Try the AVX512F vperm{s,d} instructions. */
17001 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17002 return true;
17003
17004 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17005 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17006 return true;
17007
17008 /* See if we can get the same permutation in a different vector integer
17009 mode. */
17010 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17011 {
17012 if (!d->testing_p)
17013 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17014 return true;
17015 }
17016 return false;
17017 }
17018
17019 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
17020 in terms of a pair of pshuflw + pshufhw instructions. */
17021
17022 static bool
17023 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17024 {
17025 unsigned char perm2[MAX_VECT_LEN];
17026 unsigned i;
17027 bool ok;
17028
17029 if (d->vmode != V8HImode || !d->one_operand_p)
17030 return false;
17031
17032 /* The two permutations only operate in 64-bit lanes. */
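/* E.g. the V8HImode permutation { 2 0 3 1 6 4 7 5 } qualifies: elements
   0-3 of the result come from the low quadword and 4-7 from the high
   quadword, so pshuflw reorders the low half and pshufhw the high half.  */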
17033 for (i = 0; i < 4; ++i)
17034 if (d->perm[i] >= 4)
17035 return false;
17036 for (i = 4; i < 8; ++i)
17037 if (d->perm[i] < 4)
17038 return false;
17039
17040 if (d->testing_p)
17041 return true;
17042
17043 /* Emit the pshuflw. */
17044 memcpy (perm2, d->perm, 4);
17045 for (i = 4; i < 8; ++i)
17046 perm2[i] = i;
17047 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17048 gcc_assert (ok);
17049
17050 /* Emit the pshufhw. */
17051 memcpy (perm2 + 4, d->perm + 4, 4);
17052 for (i = 0; i < 4; ++i)
17053 perm2[i] = i;
17054 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17055 gcc_assert (ok);
17056
17057 return true;
17058 }
17059
17060 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17061 the permutation using the SSSE3 palignr instruction. This succeeds
17062 when all of the elements in PERM fit within one vector and we merely
17063 need to shift them down so that a single vector permutation has a
17064 chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
17065 the vpalignr instruction itself can perform the requested permutation.  */
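/* For example, the two-operand V4SImode permutation { 3 4 5 6 } selects a
   contiguous window of the concatenated operands; shifting it down by three
   elements with palignr yields the result directly, which is the in_order
   case handled below.  */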
17066
17067 static bool
17068 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17069 {
17070 unsigned i, nelt = d->nelt;
17071 unsigned min, max, minswap, maxswap;
17072 bool in_order, ok, swap = false;
17073 rtx shift, target;
17074 struct expand_vec_perm_d dcopy;
17075
17076 /* Even with AVX, palignr only operates on 128-bit vectors;
17077 with AVX2, palignr operates on both 128-bit lanes.  */
17078 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17079 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17080 return false;
17081
17082 min = 2 * nelt;
17083 max = 0;
17084 minswap = 2 * nelt;
17085 maxswap = 0;
17086 for (i = 0; i < nelt; ++i)
17087 {
17088 unsigned e = d->perm[i];
17089 unsigned eswap = d->perm[i] ^ nelt;
17090 if (GET_MODE_SIZE (d->vmode) == 32)
17091 {
17092 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17093 eswap = e ^ (nelt / 2);
17094 }
17095 if (e < min)
17096 min = e;
17097 if (e > max)
17098 max = e;
17099 if (eswap < minswap)
17100 minswap = eswap;
17101 if (eswap > maxswap)
17102 maxswap = eswap;
17103 }
17104 if (min == 0
17105 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17106 {
17107 if (d->one_operand_p
17108 || minswap == 0
17109 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17110 ? nelt / 2 : nelt))
17111 return false;
17112 swap = true;
17113 min = minswap;
17114 max = maxswap;
17115 }
17116
17117 /* Given that we have SSSE3, we know we'll be able to implement the
17118 single operand permutation after the palignr with pshufb for
17119 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17120 first. */
17121 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17122 return true;
17123
17124 dcopy = *d;
17125 if (swap)
17126 {
17127 dcopy.op0 = d->op1;
17128 dcopy.op1 = d->op0;
17129 for (i = 0; i < nelt; ++i)
17130 dcopy.perm[i] ^= nelt;
17131 }
17132
17133 in_order = true;
17134 for (i = 0; i < nelt; ++i)
17135 {
17136 unsigned e = dcopy.perm[i];
17137 if (GET_MODE_SIZE (d->vmode) == 32
17138 && e >= nelt
17139 && (e & (nelt / 2 - 1)) < min)
17140 e = e - min - (nelt / 2);
17141 else
17142 e = e - min;
17143 if (e != i)
17144 in_order = false;
17145 dcopy.perm[i] = e;
17146 }
17147 dcopy.one_operand_p = true;
17148
17149 if (single_insn_only_p && !in_order)
17150 return false;
17151
17152 /* For AVX2, test whether we can permute the result in one instruction. */
17153 if (d->testing_p)
17154 {
17155 if (in_order)
17156 return true;
17157 dcopy.op1 = dcopy.op0;
17158 return expand_vec_perm_1 (&dcopy);
17159 }
17160
17161 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17162 if (GET_MODE_SIZE (d->vmode) == 16)
17163 {
17164 target = gen_reg_rtx (TImode);
17165 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17166 gen_lowpart (TImode, dcopy.op0), shift));
17167 }
17168 else
17169 {
17170 target = gen_reg_rtx (V2TImode);
17171 emit_insn (gen_avx2_palignrv2ti (target,
17172 gen_lowpart (V2TImode, dcopy.op1),
17173 gen_lowpart (V2TImode, dcopy.op0),
17174 shift));
17175 }
17176
17177 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17178
17179 /* Test for the degenerate case where the alignment by itself
17180 produces the desired permutation. */
17181 if (in_order)
17182 {
17183 emit_move_insn (d->target, dcopy.op0);
17184 return true;
17185 }
17186
17187 ok = expand_vec_perm_1 (&dcopy);
17188 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17189
17190 return ok;
17191 }
17192
17193 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17194 the permutation using the SSE4_1 pblendv instruction.  Potentially
17195 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and 1 pblendv.
17196
17197 static bool
17198 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17199 {
17200 unsigned i, which, nelt = d->nelt;
17201 struct expand_vec_perm_d dcopy, dcopy1;
17202 machine_mode vmode = d->vmode;
17203 bool ok;
17204
17205 /* Use the same checks as in expand_vec_perm_blend. */
17206 if (d->one_operand_p)
17207 return false;
17208 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17209 ;
17210 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17211 ;
17212 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17213 ;
17214 else
17215 return false;
17216
17217 /* Figure out which permutation elements do not stay in their
17218 original positions, and which operand they come from.  */
17219 for (i = 0, which = 0; i < nelt; ++i)
17220 {
17221 unsigned e = d->perm[i];
17222 if (e != i)
17223 which |= (e < nelt ? 1 : 2);
17224 }
17225 /* We can pblend the elements that are out of their original
17226 positions only when they all come from the same operand.
17227 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are out of place,
17228 but both are >= 8, i.e. both come from the second operand.
17229 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are out of place,
17230 but 8 >= 8 while 2 < 8, so they come from different
17231 operands.  */
17232 if (which != 1 && which != 2)
17233 return false;
17234 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17235 return true;
17236
17237 /* First apply a one-operand permutation to the operand that supplies
17238 the out-of-place elements, moving them into their target positions.  */
17239 dcopy = *d;
17240 if (which == 2)
17241 dcopy.op0 = dcopy.op1 = d->op1;
17242 else
17243 dcopy.op0 = dcopy.op1 = d->op0;
17244 if (!d->testing_p)
17245 dcopy.target = gen_reg_rtx (vmode);
17246 dcopy.one_operand_p = true;
17247
17248 for (i = 0; i < nelt; ++i)
17249 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17250
17251 ok = expand_vec_perm_1 (&dcopy);
17252 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17253 return false;
17254 else
17255 gcc_assert (ok);
17256 if (d->testing_p)
17257 return true;
17258
17259 /* Next, blend the permuted elements into their final positions.  */
17260 dcopy1 = *d;
17261 if (which == 2)
17262 dcopy1.op1 = dcopy.target;
17263 else
17264 dcopy1.op0 = dcopy.target;
17265
17266 for (i = 0; i < nelt; ++i)
17267 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17268
17269 ok = expand_vec_perm_blend (&dcopy1);
17270 gcc_assert (ok);
17271
17272 return true;
17273 }
17274
17275 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17276
17277 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17278 a two vector permutation into a single vector permutation by using
17279 an interleave operation to merge the vectors. */
17280
17281 static bool
17282 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17283 {
17284 struct expand_vec_perm_d dremap, dfinal;
17285 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17286 unsigned HOST_WIDE_INT contents;
17287 unsigned char remap[2 * MAX_VECT_LEN];
17288 rtx_insn *seq;
17289 bool ok, same_halves = false;
17290
17291 if (GET_MODE_SIZE (d->vmode) == 16)
17292 {
17293 if (d->one_operand_p)
17294 return false;
17295 }
17296 else if (GET_MODE_SIZE (d->vmode) == 32)
17297 {
17298 if (!TARGET_AVX)
17299 return false;
17300 /* For 32-byte modes allow even d->one_operand_p.
17301 The lack of cross-lane shuffling in some instructions
17302 might prevent a single insn shuffle. */
17303 dfinal = *d;
17304 dfinal.testing_p = true;
17305 /* If expand_vec_perm_interleave3 can expand this into
17306 a 3-insn sequence, give up and let it be expanded that
17307 way.  While that is one insn longer, it doesn't need a
17308 memory operand, and in the common case where both the
17309 interleave-low and interleave-high permutations with the
17310 same operands are adjacent, the pair needs only 4 insns
17311 for both after CSE.  */
17312 if (expand_vec_perm_interleave3 (&dfinal))
17313 return false;
17314 }
17315 else
17316 return false;
17317
17318 /* Examine from whence the elements come. */
17319 contents = 0;
17320 for (i = 0; i < nelt; ++i)
17321 contents |= HOST_WIDE_INT_1U << d->perm[i];
17322
17323 memset (remap, 0xff, sizeof (remap));
17324 dremap = *d;
17325
17326 if (GET_MODE_SIZE (d->vmode) == 16)
17327 {
17328 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17329
17330 /* Split the two input vectors into 4 halves. */
17331 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17332 h2 = h1 << nelt2;
17333 h3 = h2 << nelt2;
17334 h4 = h3 << nelt2;
17335
17336 /* If all elements come from the low halves, use interleave low; similarly
17337 for interleave high.  If the elements are from mis-matched halves, we
17338 can use shufps for V4SF/V4SI or do a DImode shuffle. */
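/* E.g. for the two-operand V4SImode permutation { 1 5 0 4 } all elements
   come from the two low halves, so dremap becomes the interleave-low
   { 0 4 1 5 } and the remaining dfinal shuffle is the single-insn
   { 2 3 0 1 } (a pshufd).  */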
17339 if ((contents & (h1 | h3)) == contents)
17340 {
17341 /* punpckl* */
17342 for (i = 0; i < nelt2; ++i)
17343 {
17344 remap[i] = i * 2;
17345 remap[i + nelt] = i * 2 + 1;
17346 dremap.perm[i * 2] = i;
17347 dremap.perm[i * 2 + 1] = i + nelt;
17348 }
17349 if (!TARGET_SSE2 && d->vmode == V4SImode)
17350 dremap.vmode = V4SFmode;
17351 }
17352 else if ((contents & (h2 | h4)) == contents)
17353 {
17354 /* punpckh* */
17355 for (i = 0; i < nelt2; ++i)
17356 {
17357 remap[i + nelt2] = i * 2;
17358 remap[i + nelt + nelt2] = i * 2 + 1;
17359 dremap.perm[i * 2] = i + nelt2;
17360 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17361 }
17362 if (!TARGET_SSE2 && d->vmode == V4SImode)
17363 dremap.vmode = V4SFmode;
17364 }
17365 else if ((contents & (h1 | h4)) == contents)
17366 {
17367 /* shufps */
17368 for (i = 0; i < nelt2; ++i)
17369 {
17370 remap[i] = i;
17371 remap[i + nelt + nelt2] = i + nelt2;
17372 dremap.perm[i] = i;
17373 dremap.perm[i + nelt2] = i + nelt + nelt2;
17374 }
17375 if (nelt != 4)
17376 {
17377 /* shufpd */
17378 dremap.vmode = V2DImode;
17379 dremap.nelt = 2;
17380 dremap.perm[0] = 0;
17381 dremap.perm[1] = 3;
17382 }
17383 }
17384 else if ((contents & (h2 | h3)) == contents)
17385 {
17386 /* shufps */
17387 for (i = 0; i < nelt2; ++i)
17388 {
17389 remap[i + nelt2] = i;
17390 remap[i + nelt] = i + nelt2;
17391 dremap.perm[i] = i + nelt2;
17392 dremap.perm[i + nelt2] = i + nelt;
17393 }
17394 if (nelt != 4)
17395 {
17396 /* shufpd */
17397 dremap.vmode = V2DImode;
17398 dremap.nelt = 2;
17399 dremap.perm[0] = 1;
17400 dremap.perm[1] = 2;
17401 }
17402 }
17403 else
17404 return false;
17405 }
17406 else
17407 {
17408 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17409 unsigned HOST_WIDE_INT q[8];
17410 unsigned int nonzero_halves[4];
17411
17412 /* Split the two input vectors into 8 quarters. */
17413 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17414 for (i = 1; i < 8; ++i)
17415 q[i] = q[0] << (nelt4 * i);
17416 for (i = 0; i < 4; ++i)
17417 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17418 {
17419 nonzero_halves[nzcnt] = i;
17420 ++nzcnt;
17421 }
17422
17423 if (nzcnt == 1)
17424 {
17425 gcc_assert (d->one_operand_p);
17426 nonzero_halves[1] = nonzero_halves[0];
17427 same_halves = true;
17428 }
17429 else if (d->one_operand_p)
17430 {
17431 gcc_assert (nonzero_halves[0] == 0);
17432 gcc_assert (nonzero_halves[1] == 1);
17433 }
17434
17435 if (nzcnt <= 2)
17436 {
17437 if (d->perm[0] / nelt2 == nonzero_halves[1])
17438 {
17439 /* Attempt to increase the likelihood that dfinal
17440 shuffle will be intra-lane. */
17441 std::swap (nonzero_halves[0], nonzero_halves[1]);
17442 }
17443
17444 /* vperm2f128 or vperm2i128. */
17445 for (i = 0; i < nelt2; ++i)
17446 {
17447 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17448 remap[i + nonzero_halves[0] * nelt2] = i;
17449 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17450 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17451 }
17452
17453 if (d->vmode != V8SFmode
17454 && d->vmode != V4DFmode
17455 && d->vmode != V8SImode)
17456 {
17457 dremap.vmode = V8SImode;
17458 dremap.nelt = 8;
17459 for (i = 0; i < 4; ++i)
17460 {
17461 dremap.perm[i] = i + nonzero_halves[0] * 4;
17462 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17463 }
17464 }
17465 }
17466 else if (d->one_operand_p)
17467 return false;
17468 else if (TARGET_AVX2
17469 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17470 {
17471 /* vpunpckl* */
17472 for (i = 0; i < nelt4; ++i)
17473 {
17474 remap[i] = i * 2;
17475 remap[i + nelt] = i * 2 + 1;
17476 remap[i + nelt2] = i * 2 + nelt2;
17477 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17478 dremap.perm[i * 2] = i;
17479 dremap.perm[i * 2 + 1] = i + nelt;
17480 dremap.perm[i * 2 + nelt2] = i + nelt2;
17481 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17482 }
17483 }
17484 else if (TARGET_AVX2
17485 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17486 {
17487 /* vpunpckh* */
17488 for (i = 0; i < nelt4; ++i)
17489 {
17490 remap[i + nelt4] = i * 2;
17491 remap[i + nelt + nelt4] = i * 2 + 1;
17492 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17493 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17494 dremap.perm[i * 2] = i + nelt4;
17495 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17496 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17497 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17498 }
17499 }
17500 else
17501 return false;
17502 }
17503
17504 /* Use the remapping array set up above to move the elements from their
17505 swizzled locations into their final destinations. */
17506 dfinal = *d;
17507 for (i = 0; i < nelt; ++i)
17508 {
17509 unsigned e = remap[d->perm[i]];
17510 gcc_assert (e < nelt);
17511 /* If same_halves is true, both halves of the remapped vector are the
17512 same. Avoid cross-lane accesses if possible. */
17513 if (same_halves && i >= nelt2)
17514 {
17515 gcc_assert (e < nelt2);
17516 dfinal.perm[i] = e + nelt2;
17517 }
17518 else
17519 dfinal.perm[i] = e;
17520 }
17521 if (!d->testing_p)
17522 {
17523 dremap.target = gen_reg_rtx (dremap.vmode);
17524 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17525 }
17526 dfinal.op1 = dfinal.op0;
17527 dfinal.one_operand_p = true;
17528
17529 /* Test if the final remap can be done with a single insn. For V4SFmode or
17530 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17531 start_sequence ();
17532 ok = expand_vec_perm_1 (&dfinal);
17533 seq = get_insns ();
17534 end_sequence ();
17535
17536 if (!ok)
17537 return false;
17538
17539 if (d->testing_p)
17540 return true;
17541
17542 if (dremap.vmode != dfinal.vmode)
17543 {
17544 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17545 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17546 }
17547
17548 ok = expand_vec_perm_1 (&dremap);
17549 gcc_assert (ok);
17550
17551 emit_insn (seq);
17552 return true;
17553 }
17554
17555 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17556 a single vector cross-lane permutation into vpermq followed
17557 by any of the single insn permutations. */
17558
17559 static bool
17560 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17561 {
17562 struct expand_vec_perm_d dremap, dfinal;
17563 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17564 unsigned contents[2];
17565 bool ok;
17566
17567 if (!(TARGET_AVX2
17568 && (d->vmode == V32QImode || d->vmode == V16HImode)
17569 && d->one_operand_p))
17570 return false;
17571
17572 contents[0] = 0;
17573 contents[1] = 0;
17574 for (i = 0; i < nelt2; ++i)
17575 {
17576 contents[0] |= 1u << (d->perm[i] / nelt4);
17577 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17578 }
17579
17580 for (i = 0; i < 2; ++i)
17581 {
17582 unsigned int cnt = 0;
17583 for (j = 0; j < 4; ++j)
17584 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17585 return false;
17586 }
17587
17588 if (d->testing_p)
17589 return true;
17590
17591 dremap = *d;
17592 dremap.vmode = V4DImode;
17593 dremap.nelt = 4;
17594 dremap.target = gen_reg_rtx (V4DImode);
17595 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17596 dremap.op1 = dremap.op0;
17597 dremap.one_operand_p = true;
17598 for (i = 0; i < 2; ++i)
17599 {
17600 unsigned int cnt = 0;
17601 for (j = 0; j < 4; ++j)
17602 if ((contents[i] & (1u << j)) != 0)
17603 dremap.perm[2 * i + cnt++] = j;
17604 for (; cnt < 2; ++cnt)
17605 dremap.perm[2 * i + cnt] = 0;
17606 }
17607
17608 dfinal = *d;
17609 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17610 dfinal.op1 = dfinal.op0;
17611 dfinal.one_operand_p = true;
17612 for (i = 0, j = 0; i < nelt; ++i)
17613 {
17614 if (i == nelt2)
17615 j = 2;
17616 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17617 if ((d->perm[i] / nelt4) == dremap.perm[j])
17618 ;
17619 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17620 dfinal.perm[i] |= nelt4;
17621 else
17622 gcc_unreachable ();
17623 }
17624
17625 ok = expand_vec_perm_1 (&dremap);
17626 gcc_assert (ok);
17627
17628 ok = expand_vec_perm_1 (&dfinal);
17629 gcc_assert (ok);
17630
17631 return true;
17632 }
17633
17634 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17635
17636 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
17637 a vector permutation using two instructions, vperm2f128 resp.
17638 vperm2i128 followed by any single in-lane permutation. */
17639
17640 static bool
17641 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17642 {
17643 struct expand_vec_perm_d dfirst, dsecond;
17644 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17645 bool ok;
17646
17647 if (!TARGET_AVX
17648 || GET_MODE_SIZE (d->vmode) != 32
17649 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17650 return false;
17651
17652 dsecond = *d;
17653 dsecond.one_operand_p = false;
17654 dsecond.testing_p = true;
17655
17656 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17657 immediate.  For perm < 16 the second permutation uses
17658 d->op0 as its first operand; for perm >= 16 it uses d->op1
17659 as its first operand.  The second operand is the result of
17660 vperm2[fi]128. */
17661 for (perm = 0; perm < 32; perm++)
17662 {
17663 /* Ignore permutations which do not move anything cross-lane. */
17664 if (perm < 16)
17665 {
17666 /* The second shuffle for e.g. V4DFmode has
17667 0123 and ABCD operands.
17668 Ignore AB23, as 23 is already in the second lane
17669 of the first operand. */
17670 if ((perm & 0xc) == (1 << 2)) continue;
17671 /* And 01CD, as 01 is in the first lane of the first
17672 operand. */
17673 if ((perm & 3) == 0) continue;
17674 /* And 4567, as then the vperm2[fi]128 doesn't change
17675 anything on the original 4567 second operand. */
17676 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17677 }
17678 else
17679 {
17680 /* The second shuffle for e.g. V4DFmode has
17681 4567 and ABCD operands.
17682 Ignore AB67, as 67 is already in the second lane
17683 of the first operand. */
17684 if ((perm & 0xc) == (3 << 2)) continue;
17685 /* And 45CD, as 45 is in the first lane of the first
17686 operand. */
17687 if ((perm & 3) == 2) continue;
17688 /* And 0123, as then the vperm2[fi]128 doesn't change
17689 anything on the original 0123 first operand. */
17690 if ((perm & 0xf) == (1 << 2)) continue;
17691 }
17692
17693 for (i = 0; i < nelt; i++)
17694 {
17695 j = d->perm[i] / nelt2;
17696 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17697 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17698 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17699 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17700 else
17701 break;
17702 }
17703
17704 if (i == nelt)
17705 {
17706 start_sequence ();
17707 ok = expand_vec_perm_1 (&dsecond);
17708 end_sequence ();
17709 }
17710 else
17711 ok = false;
17712
17713 if (ok)
17714 {
17715 if (d->testing_p)
17716 return true;
17717
17718 /* Found a usable second shuffle. dfirst will be
17719 vperm2f128 on d->op0 and d->op1. */
17720 dsecond.testing_p = false;
17721 dfirst = *d;
17722 dfirst.target = gen_reg_rtx (d->vmode);
17723 for (i = 0; i < nelt; i++)
17724 dfirst.perm[i] = (i & (nelt2 - 1))
17725 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17726
17727 canonicalize_perm (&dfirst);
17728 ok = expand_vec_perm_1 (&dfirst);
17729 gcc_assert (ok);
17730
17731 /* And dsecond is some single insn shuffle, taking
17732 d->op0 and result of vperm2f128 (if perm < 16) or
17733 d->op1 and result of vperm2f128 (otherwise). */
17734 if (perm >= 16)
17735 dsecond.op0 = dsecond.op1;
17736 dsecond.op1 = dfirst.target;
17737
17738 ok = expand_vec_perm_1 (&dsecond);
17739 gcc_assert (ok);
17740
17741 return true;
17742 }
17743
17744 /* For one operand, the only useful vperm2f128 permutation is 0x01
17745 aka lanes swap. */
17746 if (d->one_operand_p)
17747 return false;
17748 }
17749
17750 return false;
17751 }
17752
17753 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17754 a two vector permutation using 2 intra-lane interleave insns
17755 and cross-lane shuffle for 32-byte vectors. */
17756
17757 static bool
17758 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17759 {
17760 unsigned i, nelt;
17761 rtx (*gen) (rtx, rtx, rtx);
17762
17763 if (d->one_operand_p)
17764 return false;
17765 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17766 ;
17767 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17768 ;
17769 else
17770 return false;
17771
17772 nelt = d->nelt;
17773 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17774 return false;
17775 for (i = 0; i < nelt; i += 2)
17776 if (d->perm[i] != d->perm[0] + i / 2
17777 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17778 return false;
17779
17780 if (d->testing_p)
17781 return true;
17782
17783 switch (d->vmode)
17784 {
17785 case E_V32QImode:
17786 if (d->perm[0])
17787 gen = gen_vec_interleave_highv32qi;
17788 else
17789 gen = gen_vec_interleave_lowv32qi;
17790 break;
17791 case E_V16HImode:
17792 if (d->perm[0])
17793 gen = gen_vec_interleave_highv16hi;
17794 else
17795 gen = gen_vec_interleave_lowv16hi;
17796 break;
17797 case E_V8SImode:
17798 if (d->perm[0])
17799 gen = gen_vec_interleave_highv8si;
17800 else
17801 gen = gen_vec_interleave_lowv8si;
17802 break;
17803 case E_V4DImode:
17804 if (d->perm[0])
17805 gen = gen_vec_interleave_highv4di;
17806 else
17807 gen = gen_vec_interleave_lowv4di;
17808 break;
17809 case E_V8SFmode:
17810 if (d->perm[0])
17811 gen = gen_vec_interleave_highv8sf;
17812 else
17813 gen = gen_vec_interleave_lowv8sf;
17814 break;
17815 case E_V4DFmode:
17816 if (d->perm[0])
17817 gen = gen_vec_interleave_highv4df;
17818 else
17819 gen = gen_vec_interleave_lowv4df;
17820 break;
17821 default:
17822 gcc_unreachable ();
17823 }
17824
17825 emit_insn (gen (d->target, d->op0, d->op1));
17826 return true;
17827 }
17828
17829 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
17830 a single vector permutation using a single intra-lane vector
17831 permutation, vperm2f128 swapping the lanes and vblend* insn blending
17832 the non-swapped and swapped vectors together. */
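/* E.g. for the one-operand V8SFmode permutation { 0 1 6 7 4 5 2 3 } the
   intra-lane shuffle is just the identity (a plain copy); blending the
   lane-swapped copy { 4 5 6 7 0 1 2 3 } back in at positions 2, 3, 6 and 7
   (mask 0xcc) produces the result.  */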
17833
17834 static bool
17835 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
17836 {
17837 struct expand_vec_perm_d dfirst, dsecond;
17838 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
17839 rtx_insn *seq;
17840 bool ok;
17841 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
17842
17843 if (!TARGET_AVX
17844 || TARGET_AVX2
17845 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
17846 || !d->one_operand_p)
17847 return false;
17848
17849 dfirst = *d;
17850 for (i = 0; i < nelt; i++)
17851 dfirst.perm[i] = 0xff;
17852 for (i = 0, msk = 0; i < nelt; i++)
17853 {
17854 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
17855 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
17856 return false;
17857 dfirst.perm[j] = d->perm[i];
17858 if (j != i)
17859 msk |= (1 << i);
17860 }
17861 for (i = 0; i < nelt; i++)
17862 if (dfirst.perm[i] == 0xff)
17863 dfirst.perm[i] = i;
17864
17865 if (!d->testing_p)
17866 dfirst.target = gen_reg_rtx (dfirst.vmode);
17867
17868 start_sequence ();
17869 ok = expand_vec_perm_1 (&dfirst);
17870 seq = get_insns ();
17871 end_sequence ();
17872
17873 if (!ok)
17874 return false;
17875
17876 if (d->testing_p)
17877 return true;
17878
17879 emit_insn (seq);
17880
17881 dsecond = *d;
17882 dsecond.op0 = dfirst.target;
17883 dsecond.op1 = dfirst.target;
17884 dsecond.one_operand_p = true;
17885 dsecond.target = gen_reg_rtx (dsecond.vmode);
17886 for (i = 0; i < nelt; i++)
17887 dsecond.perm[i] = i ^ nelt2;
17888
17889 ok = expand_vec_perm_1 (&dsecond);
17890 gcc_assert (ok);
17891
17892 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
17893 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
17894 return true;
17895 }
17896
17897 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
17898 permutation using two vperm2f128, followed by a vshufpd insn blending
17899 the two vectors together. */
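/* E.g. for the single-operand V4DFmode rotation { 1 2 3 0 }, the two
   vperm2f128 results are { d0 d1 d2 d3 } and { d2 d3 d0 d1 }, and the
   final vshufpd, selecting { 1 4 3 6 } from that pair, blends them into
   { d1 d2 d3 d0 }.  */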
17900
17901 static bool
17902 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
17903 {
17904 struct expand_vec_perm_d dfirst, dsecond, dthird;
17905 bool ok;
17906
17907 if (!TARGET_AVX || (d->vmode != V4DFmode))
17908 return false;
17909
17910 if (d->testing_p)
17911 return true;
17912
17913 dfirst = *d;
17914 dsecond = *d;
17915 dthird = *d;
17916
17917 dfirst.perm[0] = (d->perm[0] & ~1);
17918 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
17919 dfirst.perm[2] = (d->perm[2] & ~1);
17920 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
17921 dsecond.perm[0] = (d->perm[1] & ~1);
17922 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
17923 dsecond.perm[2] = (d->perm[3] & ~1);
17924 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
17925 dthird.perm[0] = (d->perm[0] % 2);
17926 dthird.perm[1] = (d->perm[1] % 2) + 4;
17927 dthird.perm[2] = (d->perm[2] % 2) + 2;
17928 dthird.perm[3] = (d->perm[3] % 2) + 6;
17929
17930 dfirst.target = gen_reg_rtx (dfirst.vmode);
17931 dsecond.target = gen_reg_rtx (dsecond.vmode);
17932 dthird.op0 = dfirst.target;
17933 dthird.op1 = dsecond.target;
17934 dthird.one_operand_p = false;
17935
17936 canonicalize_perm (&dfirst);
17937 canonicalize_perm (&dsecond);
17938
17939 ok = expand_vec_perm_1 (&dfirst)
17940 && expand_vec_perm_1 (&dsecond)
17941 && expand_vec_perm_1 (&dthird);
17942
17943 gcc_assert (ok);
17944
17945 return true;
17946 }
17947
17948 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
17949 permutation with two pshufb insns and an ior. We should have already
17950 failed all two instruction sequences. */
17951
17952 static bool
17953 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
17954 {
17955 rtx rperm[2][16], vperm, l, h, op, m128;
17956 unsigned int i, nelt, eltsz;
17957
17958 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17959 return false;
17960 gcc_assert (!d->one_operand_p);
17961
17962 if (d->testing_p)
17963 return true;
17964
17965 nelt = d->nelt;
17966 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
17967
17968 /* Generate two permutation masks. If the required element is within
17969 the given vector it is shuffled into the proper lane. If the required
17970 element is in the other vector, force a zero into the lane by setting
17971 bit 7 in the permutation mask. */
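/* E.g. for a V16QImode byte interleave { 0 16 1 17 ... } the mask for
   d->op0 becomes { 0 -128 1 -128 ... } and the mask for d->op1 becomes
   { -128 0 -128 1 ... }; the two pshufb results are then combined with
   the ior below.  */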
17972 m128 = GEN_INT (-128);
17973 for (i = 0; i < nelt; ++i)
17974 {
17975 unsigned j, e = d->perm[i];
17976 unsigned which = (e >= nelt);
17977 if (e >= nelt)
17978 e -= nelt;
17979
17980 for (j = 0; j < eltsz; ++j)
17981 {
17982 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
17983 rperm[1-which][i*eltsz + j] = m128;
17984 }
17985 }
17986
17987 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
17988 vperm = force_reg (V16QImode, vperm);
17989
17990 l = gen_reg_rtx (V16QImode);
17991 op = gen_lowpart (V16QImode, d->op0);
17992 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
17993
17994 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
17995 vperm = force_reg (V16QImode, vperm);
17996
17997 h = gen_reg_rtx (V16QImode);
17998 op = gen_lowpart (V16QImode, d->op1);
17999 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18000
18001 op = d->target;
18002 if (d->vmode != V16QImode)
18003 op = gen_reg_rtx (V16QImode);
18004 emit_insn (gen_iorv16qi3 (op, l, h));
18005 if (op != d->target)
18006 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18007
18008 return true;
18009 }
18010
18011 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
18012 with two vpshufb insns, vpermq and vpor. We should have already failed
18013 all two or three instruction sequences. */
18014
18015 static bool
18016 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18017 {
18018 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18019 unsigned int i, nelt, eltsz;
18020
18021 if (!TARGET_AVX2
18022 || !d->one_operand_p
18023 || (d->vmode != V32QImode && d->vmode != V16HImode))
18024 return false;
18025
18026 if (d->testing_p)
18027 return true;
18028
18029 nelt = d->nelt;
18030 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18031
18032 /* Generate two permutation masks. If the required element is within
18033 the same lane, it is shuffled in.  If the required element is from the
18034 other lane, force a zero by setting bit 7 in the permutation mask.
18035 The other mask has non-negative elements where an element is
18036 requested from the other lane; such elements are also moved to the
18037 other lane, so that the result of vpshufb can have the two V2TImode halves
18038 swapped. */
18039 m128 = GEN_INT (-128);
18040 for (i = 0; i < nelt; ++i)
18041 {
18042 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18043 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18044
18045 for (j = 0; j < eltsz; ++j)
18046 {
18047 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18048 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18049 }
18050 }
18051
18052 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18053 vperm = force_reg (V32QImode, vperm);
18054
18055 h = gen_reg_rtx (V32QImode);
18056 op = gen_lowpart (V32QImode, d->op0);
18057 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18058
18059 /* Swap the 128-bit lanes of h into hp.  */
18060 hp = gen_reg_rtx (V4DImode);
18061 op = gen_lowpart (V4DImode, h);
18062 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18063 const1_rtx));
18064
18065 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18066 vperm = force_reg (V32QImode, vperm);
18067
18068 l = gen_reg_rtx (V32QImode);
18069 op = gen_lowpart (V32QImode, d->op0);
18070 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18071
18072 op = d->target;
18073 if (d->vmode != V32QImode)
18074 op = gen_reg_rtx (V32QImode);
18075 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18076 if (op != d->target)
18077 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18078
18079 return true;
18080 }
18081
18082 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18083 and extract-odd permutations of two V32QImode or V16HImode operands
18084 with two vpshufb insns, vpor and vpermq. We should have already
18085 failed all two or three instruction sequences. */
18086
18087 static bool
18088 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18089 {
18090 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18091 unsigned int i, nelt, eltsz;
18092
18093 if (!TARGET_AVX2
18094 || d->one_operand_p
18095 || (d->vmode != V32QImode && d->vmode != V16HImode))
18096 return false;
18097
18098 for (i = 0; i < d->nelt; ++i)
18099 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18100 return false;
18101
18102 if (d->testing_p)
18103 return true;
18104
18105 nelt = d->nelt;
18106 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18107
18108 /* Generate two permutation masks. In the first permutation mask
18109 the first quarter will contain indexes for the first half
18110 of the op0, the second quarter will contain bit 7 set, third quarter
18111 will contain indexes for the second half of the op0 and the
18112 last quarter bit 7 set. In the second permutation mask
18113 the first quarter will contain bit 7 set, the second quarter
18114 indexes for the first half of the op1, the third quarter bit 7 set
18115 and last quarter indexes for the second half of the op1.
18116 I.e. the first mask e.g. for V32QImode extract even will be:
18117 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18118 (all values masked with 0xf except for -128) and second mask
18119 for extract even will be
18120 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18121 m128 = GEN_INT (-128);
18122 for (i = 0; i < nelt; ++i)
18123 {
18124 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18125 unsigned which = d->perm[i] >= nelt;
18126 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18127
18128 for (j = 0; j < eltsz; ++j)
18129 {
18130 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18131 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18132 }
18133 }
18134
18135 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18136 vperm = force_reg (V32QImode, vperm);
18137
18138 l = gen_reg_rtx (V32QImode);
18139 op = gen_lowpart (V32QImode, d->op0);
18140 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18141
18142 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18143 vperm = force_reg (V32QImode, vperm);
18144
18145 h = gen_reg_rtx (V32QImode);
18146 op = gen_lowpart (V32QImode, d->op1);
18147 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18148
18149 ior = gen_reg_rtx (V32QImode);
18150 emit_insn (gen_iorv32qi3 (ior, l, h));
18151
18152 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18153 op = gen_reg_rtx (V4DImode);
18154 ior = gen_lowpart (V4DImode, ior);
18155 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18156 const1_rtx, GEN_INT (3)));
18157 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18158
18159 return true;
18160 }
18161
18162 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18163 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18164 with two "and" and "pack" or two "shift" and "pack" insns. We should
18165 have already failed all two instruction sequences. */
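/* E.g. for a V16QImode extract-even, each 16-bit word of both operands is
   masked with 0x00ff and the two results are narrowed and merged with
   packuswb; for extract-odd, the words are shifted right by 8 instead of
   being masked.  */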
18166
18167 static bool
18168 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18169 {
18170 rtx op, dop0, dop1, t;
18171 unsigned i, odd, c, s, nelt = d->nelt;
18172 bool end_perm = false;
18173 machine_mode half_mode;
18174 rtx (*gen_and) (rtx, rtx, rtx);
18175 rtx (*gen_pack) (rtx, rtx, rtx);
18176 rtx (*gen_shift) (rtx, rtx, rtx);
18177
18178 if (d->one_operand_p)
18179 return false;
18180
18181 switch (d->vmode)
18182 {
18183 case E_V8HImode:
18184 /* Required for "pack". */
18185 if (!TARGET_SSE4_1)
18186 return false;
18187 c = 0xffff;
18188 s = 16;
18189 half_mode = V4SImode;
18190 gen_and = gen_andv4si3;
18191 gen_pack = gen_sse4_1_packusdw;
18192 gen_shift = gen_lshrv4si3;
18193 break;
18194 case E_V16QImode:
18195 /* No check as all instructions are SSE2. */
18196 c = 0xff;
18197 s = 8;
18198 half_mode = V8HImode;
18199 gen_and = gen_andv8hi3;
18200 gen_pack = gen_sse2_packuswb;
18201 gen_shift = gen_lshrv8hi3;
18202 break;
18203 case E_V16HImode:
18204 if (!TARGET_AVX2)
18205 return false;
18206 c = 0xffff;
18207 s = 16;
18208 half_mode = V8SImode;
18209 gen_and = gen_andv8si3;
18210 gen_pack = gen_avx2_packusdw;
18211 gen_shift = gen_lshrv8si3;
18212 end_perm = true;
18213 break;
18214 case E_V32QImode:
18215 if (!TARGET_AVX2)
18216 return false;
18217 c = 0xff;
18218 s = 8;
18219 half_mode = V16HImode;
18220 gen_and = gen_andv16hi3;
18221 gen_pack = gen_avx2_packuswb;
18222 gen_shift = gen_lshrv16hi3;
18223 end_perm = true;
18224 break;
18225 default:
18226 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18227 general shuffles. */
18228 return false;
18229 }
18230
18231 /* Check that permutation is even or odd. */
18232 odd = d->perm[0];
18233 if (odd > 1)
18234 return false;
18235
18236 for (i = 1; i < nelt; ++i)
18237 if (d->perm[i] != 2 * i + odd)
18238 return false;
18239
18240 if (d->testing_p)
18241 return true;
18242
18243 dop0 = gen_reg_rtx (half_mode);
18244 dop1 = gen_reg_rtx (half_mode);
18245 if (odd == 0)
18246 {
18247 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18248 t = force_reg (half_mode, t);
18249 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18250 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18251 }
18252 else
18253 {
18254 emit_insn (gen_shift (dop0,
18255 gen_lowpart (half_mode, d->op0),
18256 GEN_INT (s)));
18257 emit_insn (gen_shift (dop1,
18258 gen_lowpart (half_mode, d->op1),
18259 GEN_INT (s)));
18260 }
18261 /* For the 256-bit AVX2 case we need to permute the pack result.  */
18262 if (TARGET_AVX2 && end_perm)
18263 {
18264 op = gen_reg_rtx (d->vmode);
18265 t = gen_reg_rtx (V4DImode);
18266 emit_insn (gen_pack (op, dop0, dop1));
18267 emit_insn (gen_avx2_permv4di_1 (t,
18268 gen_lowpart (V4DImode, op),
18269 const0_rtx,
18270 const2_rtx,
18271 const1_rtx,
18272 GEN_INT (3)));
18273 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18274 }
18275 else
18276 emit_insn (gen_pack (d->target, dop0, dop1));
18277
18278 return true;
18279 }
18280
18281 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18282 and extract-odd permutations of two V64QI operands
18283 with two "shift", two "trunc" and one "concat" insns for "odd",
18284 and two "trunc" and one "concat" insn for "even".
18285 We should have already failed all two-instruction sequences.  */
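/* E.g. for extract-odd, each 16-bit word of both operands is shifted right
   by 8 so the odd byte becomes the low byte, both V32HImode values are
   truncated to V32QImode, and the two halves are concatenated; extract-even
   skips the shift.  */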
18286
18287 static bool
18288 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18289 {
18290 rtx t1, t2, t3, t4;
18291 unsigned i, odd, nelt = d->nelt;
18292
18293 if (!TARGET_AVX512BW
18294 || d->one_operand_p
18295 || d->vmode != V64QImode)
18296 return false;
18297
18298 /* Check that permutation is even or odd. */
18299 odd = d->perm[0];
18300 if (odd > 1)
18301 return false;
18302
18303 for (i = 1; i < nelt; ++i)
18304 if (d->perm[i] != 2 * i + odd)
18305 return false;
18306
18307 if (d->testing_p)
18308 return true;
18309
18310
18311 if (odd)
18312 {
18313 t1 = gen_reg_rtx (V32HImode);
18314 t2 = gen_reg_rtx (V32HImode);
18315 emit_insn (gen_lshrv32hi3 (t1,
18316 gen_lowpart (V32HImode, d->op0),
18317 GEN_INT (8)));
18318 emit_insn (gen_lshrv32hi3 (t2,
18319 gen_lowpart (V32HImode, d->op1),
18320 GEN_INT (8)));
18321 }
18322 else
18323 {
18324 t1 = gen_lowpart (V32HImode, d->op0);
18325 t2 = gen_lowpart (V32HImode, d->op1);
18326 }
18327
18328 t3 = gen_reg_rtx (V32QImode);
18329 t4 = gen_reg_rtx (V32QImode);
18330 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18331 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18332 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18333
18334 return true;
18335 }
18336
18337 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
18338 and extract-odd permutations. */
18339
18340 static bool
18341 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18342 {
18343 rtx t1, t2, t3, t4, t5;
18344
18345 switch (d->vmode)
18346 {
18347 case E_V4DFmode:
18348 if (d->testing_p)
18349 break;
18350 t1 = gen_reg_rtx (V4DFmode);
18351 t2 = gen_reg_rtx (V4DFmode);
18352
18353 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18354 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18355 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18356
18357 /* Now an unpck[lh]pd will produce the result required. */
18358 if (odd)
18359 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18360 else
18361 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18362 emit_insn (t3);
18363 break;
18364
18365 case E_V8SFmode:
18366 {
18367 int mask = odd ? 0xdd : 0x88;
18368
18369 if (d->testing_p)
18370 break;
18371 t1 = gen_reg_rtx (V8SFmode);
18372 t2 = gen_reg_rtx (V8SFmode);
18373 t3 = gen_reg_rtx (V8SFmode);
18374
18375 /* Shuffle within the 128-bit lanes to produce:
18376 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18377 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18378 GEN_INT (mask)));
18379
18380 /* Shuffle the lanes around to produce:
18381 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18382 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18383 GEN_INT (0x3)));
18384
18385 /* Shuffle within the 128-bit lanes to produce:
18386 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18387 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18388
18389 /* Shuffle within the 128-bit lanes to produce:
18390 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18391 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18392
18393 /* Shuffle the lanes around to produce:
18394 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18395 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18396 GEN_INT (0x20)));
18397 }
18398 break;
18399
18400 case E_V2DFmode:
18401 case E_V4SFmode:
18402 case E_V2DImode:
18403 case E_V4SImode:
18404 /* These are always directly implementable by expand_vec_perm_1. */
18405 gcc_unreachable ();
18406
18407 case E_V8HImode:
18408 if (TARGET_SSE4_1)
18409 return expand_vec_perm_even_odd_pack (d);
18410 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18411 return expand_vec_perm_pshufb2 (d);
18412 else
18413 {
18414 if (d->testing_p)
18415 break;
18416 /* We need 2*log2(N)-1 operations to achieve odd/even
18417 with interleave. */
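/* For the even case on { a0 ... a7 } and { b0 ... b7 } the intermediate
   vectors are
     t1     = { a4 b4 a5 b5 a6 b6 a7 b7 }
     target = { a0 b0 a1 b1 a2 b2 a3 b3 }
     t2     = { a2 a6 b2 b6 a3 a7 b3 b7 }
     target = { a0 a4 b0 b4 a1 a5 b1 b5 }
   and the final interleave-low yields
     target = { a0 a2 a4 a6 b0 b2 b4 b6 }.  */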
18418 t1 = gen_reg_rtx (V8HImode);
18419 t2 = gen_reg_rtx (V8HImode);
18420 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18421 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18422 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18423 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18424 if (odd)
18425 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18426 else
18427 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18428 emit_insn (t3);
18429 }
18430 break;
18431
18432 case E_V16QImode:
18433 return expand_vec_perm_even_odd_pack (d);
18434
18435 case E_V16HImode:
18436 case E_V32QImode:
18437 return expand_vec_perm_even_odd_pack (d);
18438
18439 case E_V64QImode:
18440 return expand_vec_perm_even_odd_trunc (d);
18441
18442 case E_V4DImode:
18443 if (!TARGET_AVX2)
18444 {
18445 struct expand_vec_perm_d d_copy = *d;
18446 d_copy.vmode = V4DFmode;
18447 if (d->testing_p)
18448 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18449 else
18450 d_copy.target = gen_reg_rtx (V4DFmode);
18451 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18452 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18453 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18454 {
18455 if (!d->testing_p)
18456 emit_move_insn (d->target,
18457 gen_lowpart (V4DImode, d_copy.target));
18458 return true;
18459 }
18460 return false;
18461 }
18462
18463 if (d->testing_p)
18464 break;
18465
18466 t1 = gen_reg_rtx (V4DImode);
18467 t2 = gen_reg_rtx (V4DImode);
18468
18469 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18470 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18471 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18472
18473 /* Now an vpunpck[lh]qdq will produce the result required. */
18474 if (odd)
18475 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18476 else
18477 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18478 emit_insn (t3);
18479 break;
18480
18481 case E_V8SImode:
18482 if (!TARGET_AVX2)
18483 {
18484 struct expand_vec_perm_d d_copy = *d;
18485 d_copy.vmode = V8SFmode;
18486 if (d->testing_p)
18487 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18488 else
18489 d_copy.target = gen_reg_rtx (V8SFmode);
18490 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18491 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18492 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18493 {
18494 if (!d->testing_p)
18495 emit_move_insn (d->target,
18496 gen_lowpart (V8SImode, d_copy.target));
18497 return true;
18498 }
18499 return false;
18500 }
18501
18502 if (d->testing_p)
18503 break;
18504
18505 t1 = gen_reg_rtx (V8SImode);
18506 t2 = gen_reg_rtx (V8SImode);
18507 t3 = gen_reg_rtx (V4DImode);
18508 t4 = gen_reg_rtx (V4DImode);
18509 t5 = gen_reg_rtx (V4DImode);
18510
18511 /* Shuffle the lanes around into
18512 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18513 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18514 gen_lowpart (V4DImode, d->op1),
18515 GEN_INT (0x20)));
18516 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18517 gen_lowpart (V4DImode, d->op1),
18518 GEN_INT (0x31)));
18519
18520 /* Swap the 2nd and 3rd position in each lane into
18521 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18522 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18523 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18524 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18525 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18526
18527 /* Now an vpunpck[lh]qdq will produce
18528 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18529 if (odd)
18530 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18531 gen_lowpart (V4DImode, t2));
18532 else
18533 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18534 gen_lowpart (V4DImode, t2));
18535 emit_insn (t3);
18536 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18537 break;
18538
18539 default:
18540 gcc_unreachable ();
18541 }
18542
18543 return true;
18544 }
18545
18546 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
18547 extract-even and extract-odd permutations. */
18548
18549 static bool
18550 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18551 {
18552 unsigned i, odd, nelt = d->nelt;
18553
18554 odd = d->perm[0];
18555 if (odd != 0 && odd != 1)
18556 return false;
18557
18558 for (i = 1; i < nelt; ++i)
18559 if (d->perm[i] != 2 * i + odd)
18560 return false;
18561
18562 return expand_vec_perm_even_odd_1 (d, odd);
18563 }
18564
18565 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
18566 permutations. We assume that expand_vec_perm_1 has already failed. */
18567
18568 static bool
18569 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18570 {
18571 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18572 machine_mode vmode = d->vmode;
18573 unsigned char perm2[4];
18574 rtx op0 = d->op0, dest;
18575 bool ok;
18576
18577 switch (vmode)
18578 {
18579 case E_V4DFmode:
18580 case E_V8SFmode:
18581 /* These are special-cased in sse.md so that we can optionally
18582 use the vbroadcast instruction. They expand to two insns
18583 if the input happens to be in a register. */
18584 gcc_unreachable ();
18585
18586 case E_V2DFmode:
18587 case E_V2DImode:
18588 case E_V4SFmode:
18589 case E_V4SImode:
18590 /* These are always implementable using standard shuffle patterns. */
18591 gcc_unreachable ();
18592
18593 case E_V8HImode:
18594 case E_V16QImode:
18595 /* These can be implemented via interleave. We save one insn by
18596 stopping once we have promoted to V4SImode and then using pshufd.  */
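/* E.g. to broadcast element 5 of a V8HImode vector, a single interleave-high
   produces { 4 4 5 5 6 6 7 7 }; viewed as V4SImode the wanted pair sits in
   element 1, so one pshufd of that element finishes the job.  */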
18597 if (d->testing_p)
18598 return true;
18599 do
18600 {
18601 rtx dest;
18602 rtx (*gen) (rtx, rtx, rtx)
18603 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18604 : gen_vec_interleave_lowv8hi;
18605
18606 if (elt >= nelt2)
18607 {
18608 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18609 : gen_vec_interleave_highv8hi;
18610 elt -= nelt2;
18611 }
18612 nelt2 /= 2;
18613
18614 dest = gen_reg_rtx (vmode);
18615 emit_insn (gen (dest, op0, op0));
18616 vmode = get_mode_wider_vector (vmode);
18617 op0 = gen_lowpart (vmode, dest);
18618 }
18619 while (vmode != V4SImode);
18620
18621 memset (perm2, elt, 4);
18622 dest = gen_reg_rtx (V4SImode);
18623 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18624 gcc_assert (ok);
18625 if (!d->testing_p)
18626 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18627 return true;
18628
18629 case E_V64QImode:
18630 case E_V32QImode:
18631 case E_V16HImode:
18632 case E_V8SImode:
18633 case E_V4DImode:
18634 /* For AVX2 broadcasts of the first element vpbroadcast* or
18635 vpermq should be used by expand_vec_perm_1. */
18636 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18637 return false;
18638
18639 default:
18640 gcc_unreachable ();
18641 }
18642 }
18643
18644 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
18645 broadcast permutations. */
18646
18647 static bool
18648 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18649 {
18650 unsigned i, elt, nelt = d->nelt;
18651
18652 if (!d->one_operand_p)
18653 return false;
18654
18655 elt = d->perm[0];
18656 for (i = 1; i < nelt; ++i)
18657 if (d->perm[i] != elt)
18658 return false;
18659
18660 return expand_vec_perm_broadcast_1 (d);
18661 }
18662
18663 /* Implement arbitrary permutations of two V64QImode operands
18664 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18665 static bool
18666 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18667 {
18668 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
18669 return false;
18670
18671 if (d->testing_p)
18672 return true;
18673
18674 struct expand_vec_perm_d ds[2];
18675 rtx rperm[128], vperm, target0, target1;
18676 unsigned int i, nelt;
18677 machine_mode vmode;
18678
18679 nelt = d->nelt;
18680 vmode = V64QImode;
18681
18682 for (i = 0; i < 2; i++)
18683 {
18684 ds[i] = *d;
18685 ds[i].vmode = V32HImode;
18686 ds[i].nelt = 32;
18687 ds[i].target = gen_reg_rtx (V32HImode);
18688 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
18689 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
18690 }
18691
18692 /* Prepare the two word permutations such that the first one (ds[0])
18693 puts the byte wanted in each even position either there or one
18694 position higher, and the second one (ds[1]) puts the byte wanted
18695 in each odd position either there or one position lower; the
18696 vpshufb masks built alongside then pick the right byte of each word.  */
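/* E.g. if d->perm[6] is 11, the wanted byte is the odd byte of source
   word 5, so ds[0].perm[3] = 5 places that word at result bytes 6-7,
   and rperm[6] = 7 then selects its odd byte.  */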
18697
18698 for (i = 0; i < nelt; i++)
18699 {
18700 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
18701 if (i & 1)
18702 {
18703 rperm[i] = constm1_rtx;
18704 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18705 }
18706 else
18707 {
18708 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18709 rperm[i + 64] = constm1_rtx;
18710 }
18711 }
18712
18713 bool ok = expand_vec_perm_1 (&ds[0]);
18714 gcc_assert (ok);
18715 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
18716
18717 ok = expand_vec_perm_1 (&ds[1]);
18718 gcc_assert (ok);
18719 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
18720
18721 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
18722 vperm = force_reg (vmode, vperm);
18723 target0 = gen_reg_rtx (V64QImode);
18724 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
18725
18726 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
18727 vperm = force_reg (vmode, vperm);
18728 target1 = gen_reg_rtx (V64QImode);
18729 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
18730
18731 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
18732 return true;
18733 }
18734
18735 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
18736 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
18737 all the shorter instruction sequences. */
18738
18739 static bool
18740 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
18741 {
18742 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
18743 unsigned int i, nelt, eltsz;
18744 bool used[4];
18745
18746 if (!TARGET_AVX2
18747 || d->one_operand_p
18748 || (d->vmode != V32QImode && d->vmode != V16HImode))
18749 return false;
18750
18751 if (d->testing_p)
18752 return true;
18753
18754 nelt = d->nelt;
18755 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18756
18757 /* Generate 4 permutation masks. If the required element is within
18758 the same lane, it is shuffled in.  If the required element is from the
18759 other lane, force a zero by setting bit 7 in the permutation mask.
18760 The other mask has non-negative elements where an element is
18761 requested from the other lane; such elements are also moved to the
18762 other lane, so that the result of vpshufb can have the two V2TImode halves
18763 swapped. */
18764 m128 = GEN_INT (-128);
18765 for (i = 0; i < 32; ++i)
18766 {
18767 rperm[0][i] = m128;
18768 rperm[1][i] = m128;
18769 rperm[2][i] = m128;
18770 rperm[3][i] = m128;
18771 }
18772 used[0] = false;
18773 used[1] = false;
18774 used[2] = false;
18775 used[3] = false;
18776 for (i = 0; i < nelt; ++i)
18777 {
18778 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18779 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18780 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
18781
18782 for (j = 0; j < eltsz; ++j)
18783 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
18784 used[which] = true;
18785 }
18786
18787 for (i = 0; i < 2; ++i)
18788 {
18789 if (!used[2 * i + 1])
18790 {
18791 h[i] = NULL_RTX;
18792 continue;
18793 }
18794 vperm = gen_rtx_CONST_VECTOR (V32QImode,
18795 gen_rtvec_v (32, rperm[2 * i + 1]));
18796 vperm = force_reg (V32QImode, vperm);
18797 h[i] = gen_reg_rtx (V32QImode);
18798 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
18799 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
18800 }
18801
18802 /* Swap the 128-bit lanes of h[X].  */
18803 for (i = 0; i < 2; ++i)
18804 {
18805 if (h[i] == NULL_RTX)
18806 continue;
18807 op = gen_reg_rtx (V4DImode);
18808 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
18809 const2_rtx, GEN_INT (3), const0_rtx,
18810 const1_rtx));
18811 h[i] = gen_lowpart (V32QImode, op);
18812 }
18813
18814 for (i = 0; i < 2; ++i)
18815 {
18816 if (!used[2 * i])
18817 {
18818 l[i] = NULL_RTX;
18819 continue;
18820 }
18821 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
18822 vperm = force_reg (V32QImode, vperm);
18823 l[i] = gen_reg_rtx (V32QImode);
18824 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
18825 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
18826 }
18827
18828 for (i = 0; i < 2; ++i)
18829 {
18830 if (h[i] && l[i])
18831 {
18832 op = gen_reg_rtx (V32QImode);
18833 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
18834 l[i] = op;
18835 }
18836 else if (h[i])
18837 l[i] = h[i];
18838 }
18839
18840 gcc_assert (l[0] && l[1]);
18841 op = d->target;
18842 if (d->vmode != V32QImode)
18843 op = gen_reg_rtx (V32QImode);
18844 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
18845 if (op != d->target)
18846 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18847 return true;
18848 }
18849
18850 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
18851 taken care of, perform the expansion in D and return true on success. */
18852
18853 static bool
18854 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
18855 {
18856 /* Try a single instruction expansion. */
18857 if (expand_vec_perm_1 (d))
18858 return true;
18859
18860 /* Try sequences of two instructions. */
18861
18862 if (expand_vec_perm_pshuflw_pshufhw (d))
18863 return true;
18864
18865 if (expand_vec_perm_palignr (d, false))
18866 return true;
18867
18868 if (expand_vec_perm_interleave2 (d))
18869 return true;
18870
18871 if (expand_vec_perm_broadcast (d))
18872 return true;
18873
18874 if (expand_vec_perm_vpermq_perm_1 (d))
18875 return true;
18876
18877 if (expand_vec_perm_vperm2f128 (d))
18878 return true;
18879
18880 if (expand_vec_perm_pblendv (d))
18881 return true;
18882
18883 /* Try sequences of three instructions. */
18884
18885 if (expand_vec_perm_even_odd_pack (d))
18886 return true;
18887
18888 if (expand_vec_perm_2vperm2f128_vshuf (d))
18889 return true;
18890
18891 if (expand_vec_perm_pshufb2 (d))
18892 return true;
18893
18894 if (expand_vec_perm_interleave3 (d))
18895 return true;
18896
18897 if (expand_vec_perm_vperm2f128_vblend (d))
18898 return true;
18899
18900 /* Try sequences of four instructions. */
18901
18902 if (expand_vec_perm_even_odd_trunc (d))
18903 return true;
18904 if (expand_vec_perm_vpshufb2_vpermq (d))
18905 return true;
18906
18907 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
18908 return true;
18909
18910 if (expand_vec_perm_vpermt2_vpshub2 (d))
18911 return true;
18912
18913 /* ??? Look for narrow permutations whose element orderings would
18914 allow the promotion to a wider mode. */
18915
18916 /* ??? Look for sequences of interleave or a wider permute that place
18917 the data into the correct lanes for a half-vector shuffle like
18918 pshuf[lh]w or vpermilps. */
18919
18920 /* ??? Look for sequences of interleave that produce the desired results.
18921 The combinatorics of punpck[lh] get pretty ugly... */
18922
18923 if (expand_vec_perm_even_odd (d))
18924 return true;
18925
18926 /* Even longer sequences. */
18927 if (expand_vec_perm_vpshufb4_vpermq2 (d))
18928 return true;
18929
18930 /* See if we can get the same permutation in different vector integer
18931 mode. */
18932 struct expand_vec_perm_d nd;
18933 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
18934 {
18935 if (!d->testing_p)
18936 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
18937 return true;
18938 }
18939
18940 return false;
18941 }
18942
18943 /* If a permutation only uses one operand, make it clear. Returns true
18944 if the permutation references both operands. */
18945
18946 static bool
18947 canonicalize_perm (struct expand_vec_perm_d *d)
18948 {
18949 int i, which, nelt = d->nelt;
18950
18951 for (i = which = 0; i < nelt; ++i)
18952 which |= (d->perm[i] < nelt ? 1 : 2);
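  /* WHICH is a bitmask: bit 0 is set if some element selects from the first
     operand, bit 1 if some element selects from the second.  */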
18953
18954 d->one_operand_p = true;
18955 switch (which)
18956 {
18957 default:
18958 gcc_unreachable ();
18959
18960 case 3:
18961 if (!rtx_equal_p (d->op0, d->op1))
18962 {
18963 d->one_operand_p = false;
18964 break;
18965 }
18966 /* The elements of PERM do not suggest that only the first operand
18967 is used, but both operands are identical. Allow easier matching
18968 of the permutation by folding the permutation into the single
18969 input vector. */
18970 /* FALLTHRU */
18971
18972 case 2:
18973 for (i = 0; i < nelt; ++i)
18974 d->perm[i] &= nelt - 1;
18975 d->op0 = d->op1;
18976 break;
18977
18978 case 1:
18979 d->op1 = d->op0;
18980 break;
18981 }
18982
18983 return (which == 3);
18984 }
18985
18986 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18987
18988 bool
18989 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18990 rtx op1, const vec_perm_indices &sel)
18991 {
18992 struct expand_vec_perm_d d;
18993 unsigned char perm[MAX_VECT_LEN];
18994 unsigned int i, nelt, which;
18995 bool two_args;
18996
18997 d.target = target;
18998 d.op0 = op0;
18999 d.op1 = op1;
19000
19001 d.vmode = vmode;
19002 gcc_assert (VECTOR_MODE_P (d.vmode));
19003 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19004 d.testing_p = !target;
19005
19006 gcc_assert (sel.length () == nelt);
19007 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19008
19009 /* Given sufficient ISA support we can just return true here
19010 for selected vector modes. */
19011 switch (d.vmode)
19012 {
19013 case E_V16SFmode:
19014 case E_V16SImode:
19015 case E_V8DImode:
19016 case E_V8DFmode:
19017 if (!TARGET_AVX512F)
19018 return false;
19019 /* All implementable with a single vperm[it]2 insn. */
19020 if (d.testing_p)
19021 return true;
19022 break;
19023 case E_V32HImode:
19024 if (!TARGET_AVX512BW)
19025 return false;
19026 if (d.testing_p)
19027 /* All implementable with a single vperm[it]2 insn. */
19028 return true;
19029 break;
19030 case E_V64QImode:
19031 if (!TARGET_AVX512BW)
19032 return false;
19033 if (d.testing_p)
19034 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19035 return true;
19036 break;
19037 case E_V8SImode:
19038 case E_V8SFmode:
19039 case E_V4DFmode:
19040 case E_V4DImode:
19041 if (!TARGET_AVX)
19042 return false;
19043 if (d.testing_p && TARGET_AVX512VL)
19044 /* All implementable with a single vperm[it]2 insn. */
19045 return true;
19046 break;
19047 case E_V16HImode:
19048 if (!TARGET_SSE2)
19049 return false;
19050 if (d.testing_p && TARGET_AVX2)
19051 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19052 return true;
19053 break;
19054 case E_V32QImode:
19055 if (!TARGET_SSE2)
19056 return false;
19057 if (d.testing_p && TARGET_AVX2)
19058 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19059 return true;
19060 break;
19061 case E_V8HImode:
19062 case E_V16QImode:
19063 if (!TARGET_SSE2)
19064 return false;
19065 /* Fall through. */
19066 case E_V4SImode:
19067 case E_V4SFmode:
19068 if (!TARGET_SSE)
19069 return false;
19070 /* All implementable with a single vpperm insn. */
19071 if (d.testing_p && TARGET_XOP)
19072 return true;
19073 /* All implementable with 2 pshufb + 1 ior. */
19074 if (d.testing_p && TARGET_SSSE3)
19075 return true;
19076 break;
19077 case E_V2DImode:
19078 case E_V2DFmode:
19079 if (!TARGET_SSE)
19080 return false;
19081 /* All implementable with shufpd or unpck[lh]pd. */
19082 if (d.testing_p)
19083 return true;
19084 break;
19085 default:
19086 return false;
19087 }
19088
19089 for (i = which = 0; i < nelt; ++i)
19090 {
19091 unsigned char e = sel[i];
19092 gcc_assert (e < 2 * nelt);
19093 d.perm[i] = e;
19094 perm[i] = e;
19095 which |= (e < nelt ? 1 : 2);
19096 }
19097
19098 if (d.testing_p)
19099 {
19100 /* If all elements come from the second vector, fold them to the first. */
19101 if (which == 2)
19102 for (i = 0; i < nelt; ++i)
19103 d.perm[i] -= nelt;
19104
19105 /* Check whether the mask can be applied to the vector type. */
19106 d.one_operand_p = (which != 3);
19107
19108 /* Implementable with shufps or pshufd. */
19109 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
19110 return true;
19111
19112 /* Otherwise we have to go through the motions and see if we can
19113 figure out how to generate the requested permutation. */
19114 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19115 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19116 if (!d.one_operand_p)
19117 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19118
19119 start_sequence ();
19120 bool ret = ix86_expand_vec_perm_const_1 (&d);
19121 end_sequence ();
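      /* Whatever was emitted while testing went into a scratch sequence and
	 is simply dropped here; only the boolean result matters.  */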
19122
19123 return ret;
19124 }
19125
19126 two_args = canonicalize_perm (&d);
19127
19128 if (ix86_expand_vec_perm_const_1 (&d))
19129 return true;
19130
19131 /* If the selector says both arguments are needed, but the operands are the
19132 same, the above tried to expand with one_operand_p set and a flattened
19133 selector. If that didn't work, retry without one_operand_p, which is what
19134 succeeded during the earlier testing_p pass. */
19135 if (two_args && d.one_operand_p)
19136 {
19137 d.one_operand_p = false;
19138 memcpy (d.perm, perm, sizeof (perm));
19139 return ix86_expand_vec_perm_const_1 (&d);
19140 }
19141
19142 return false;
19143 }
19144
19145 void
19146 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19147 {
19148 struct expand_vec_perm_d d;
19149 unsigned i, nelt;
19150
19151 d.target = targ;
19152 d.op0 = op0;
19153 d.op1 = op1;
19154 d.vmode = GET_MODE (targ);
19155 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19156 d.one_operand_p = false;
19157 d.testing_p = false;
19158
19159 for (i = 0; i < nelt; ++i)
19160 d.perm[i] = i * 2 + odd;
19161
19162 /* We'll either be able to implement the permutation directly... */
19163 if (expand_vec_perm_1 (&d))
19164 return;
19165
19166 /* ... or we use the special-case patterns. */
19167 expand_vec_perm_even_odd_1 (&d, odd);
19168 }
19169
19170 static void
19171 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19172 {
19173 struct expand_vec_perm_d d;
19174 unsigned i, nelt, base;
19175 bool ok;
19176
19177 d.target = targ;
19178 d.op0 = op0;
19179 d.op1 = op1;
19180 d.vmode = GET_MODE (targ);
19181 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19182 d.one_operand_p = false;
19183 d.testing_p = false;
19184
19185 base = high_p ? nelt / 2 : 0;
19186 for (i = 0; i < nelt / 2; ++i)
19187 {
19188 d.perm[i * 2] = i + base;
19189 d.perm[i * 2 + 1] = i + base + nelt;
19190 }
19191
19192 /* Note that for AVX this isn't one instruction. */
19193 ok = ix86_expand_vec_perm_const_1 (&d);
19194 gcc_assert (ok);
19195 }
19196
19197
19198 /* Expand a vector operation CODE for a V*QImode in terms of the
19199 same operation on V*HImode. */
19200
19201 void
19202 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19203 {
19204 machine_mode qimode = GET_MODE (dest);
19205 machine_mode himode;
19206 rtx (*gen_il) (rtx, rtx, rtx);
19207 rtx (*gen_ih) (rtx, rtx, rtx);
19208 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19209 struct expand_vec_perm_d d;
19210 bool ok, full_interleave;
19211 bool uns_p = false;
19212 int i;
19213
19214 switch (qimode)
19215 {
19216 case E_V16QImode:
19217 himode = V8HImode;
19218 gen_il = gen_vec_interleave_lowv16qi;
19219 gen_ih = gen_vec_interleave_highv16qi;
19220 break;
19221 case E_V32QImode:
19222 himode = V16HImode;
19223 gen_il = gen_avx2_interleave_lowv32qi;
19224 gen_ih = gen_avx2_interleave_highv32qi;
19225 break;
19226 case E_V64QImode:
19227 himode = V32HImode;
19228 gen_il = gen_avx512bw_interleave_lowv64qi;
19229 gen_ih = gen_avx512bw_interleave_highv64qi;
19230 break;
19231 default:
19232 gcc_unreachable ();
19233 }
19234
19235 op2_l = op2_h = op2;
19236 switch (code)
19237 {
19238 case MULT:
19239 /* Unpack data such that we've got a source byte in each low byte of
19240 each word. We don't care what goes into the high byte of each word.
19241 Rather than trying to get zero in there, it is most convenient to
19242 let it be a copy of the low byte. */
19243 op2_l = gen_reg_rtx (qimode);
19244 op2_h = gen_reg_rtx (qimode);
19245 emit_insn (gen_il (op2_l, op2, op2));
19246 emit_insn (gen_ih (op2_h, op2, op2));
19247
19248 op1_l = gen_reg_rtx (qimode);
19249 op1_h = gen_reg_rtx (qimode);
19250 emit_insn (gen_il (op1_l, op1, op1));
19251 emit_insn (gen_ih (op1_h, op1, op1));
19252 full_interleave = qimode == V16QImode;
19253 break;
19254
19255 case ASHIFT:
19256 case LSHIFTRT:
19257 uns_p = true;
19258 /* FALLTHRU */
19259 case ASHIFTRT:
19260 op1_l = gen_reg_rtx (himode);
19261 op1_h = gen_reg_rtx (himode);
19262 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19263 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19264 full_interleave = true;
19265 break;
19266 default:
19267 gcc_unreachable ();
19268 }
19269
19270 /* Perform the operation. */
19271 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19272 1, OPTAB_DIRECT);
19273 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19274 1, OPTAB_DIRECT);
19275 gcc_assert (res_l && res_h);
19276
19277 /* Merge the data back into the right place. */
19278 d.target = dest;
19279 d.op0 = gen_lowpart (qimode, res_l);
19280 d.op1 = gen_lowpart (qimode, res_h);
19281 d.vmode = qimode;
19282 d.nelt = GET_MODE_NUNITS (qimode);
19283 d.one_operand_p = false;
19284 d.testing_p = false;
19285
19286 if (full_interleave)
19287 {
19288 /* For SSE2, we used a full interleave, so the desired
19289 results are in the even elements. */
19290 for (i = 0; i < d.nelt; ++i)
19291 d.perm[i] = i * 2;
19292 }
19293 else
19294 {
19295 /* For AVX, the interleave used above was not cross-lane. So we
19296 extract the even elements, but with the second and third quarters swapped.
19297 Happily, that is even one insn shorter than plain even extraction.
19298 For AVX512BW we have 4 lanes. We extract evens from within a lane,
19299 always first from the first and then from the second source operand;
19300 the index bits above the low 4 bits remain the same.
19301 Thus, for d.nelt == 32 we want permutation
19302 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19303 and for d.nelt == 64 we want permutation
19304 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19305 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
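      /* Sanity check of the formula below: for d.nelt == 32 and i == 8,
	 ((16) & 14) + 32 + (8 & ~15) == 0 + 32 + 0 == 32, matching the
	 ninth entry of the first listing above.  */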
19306 for (i = 0; i < d.nelt; ++i)
19307 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19308 }
19309
19310 ok = ix86_expand_vec_perm_const_1 (&d);
19311 gcc_assert (ok);
19312
19313 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19314 gen_rtx_fmt_ee (code, qimode, op1, op2));
19315 }
19316
19317 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
19318 if OP is a CONST_VECTOR with all odd elements equal to their
19319 preceding element. */
19320
19321 static bool
19322 const_vector_equal_evenodd_p (rtx op)
19323 {
19324 machine_mode mode = GET_MODE (op);
19325 int i, nunits = GET_MODE_NUNITS (mode);
19326 if (GET_CODE (op) != CONST_VECTOR
19327 || nunits != CONST_VECTOR_NUNITS (op))
19328 return false;
19329 for (i = 0; i < nunits; i += 2)
19330 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19331 return false;
19332 return true;
19333 }
19334
19335 void
19336 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19337 bool uns_p, bool odd_p)
19338 {
19339 machine_mode mode = GET_MODE (op1);
19340 machine_mode wmode = GET_MODE (dest);
19341 rtx x;
19342 rtx orig_op1 = op1, orig_op2 = op2;
19343
19344 if (!nonimmediate_operand (op1, mode))
19345 op1 = force_reg (mode, op1);
19346 if (!nonimmediate_operand (op2, mode))
19347 op2 = force_reg (mode, op2);
19348
19349 /* We only play even/odd games with vectors of SImode. */
19350 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19351
19352 /* If we're looking for the odd results, shift those members down to
19353 the even slots. For some CPUs this is faster than a PSHUFD. */
19354 if (odd_p)
19355 {
19356 /* For XOP use vpmacsdqh, but only for signed multiplication,
19357 as the instruction is signed-only. */
19358 if (TARGET_XOP && mode == V4SImode && !uns_p)
19359 {
19360 x = force_reg (wmode, CONST0_RTX (wmode));
19361 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19362 return;
19363 }
19364
19365 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19366 if (!const_vector_equal_evenodd_p (orig_op1))
19367 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19368 x, NULL, 1, OPTAB_DIRECT);
19369 if (!const_vector_equal_evenodd_p (orig_op2))
19370 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19371 x, NULL, 1, OPTAB_DIRECT);
19372 op1 = gen_lowpart (mode, op1);
19373 op2 = gen_lowpart (mode, op2);
19374 }
19375
19376 if (mode == V16SImode)
19377 {
19378 if (uns_p)
19379 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19380 else
19381 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19382 }
19383 else if (mode == V8SImode)
19384 {
19385 if (uns_p)
19386 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19387 else
19388 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19389 }
19390 else if (uns_p)
19391 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19392 else if (TARGET_SSE4_1)
19393 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19394 else
19395 {
19396 rtx s1, s2, t0, t1, t2;
19397
19398 /* The easiest way to implement this without PMULDQ is to go through
19399 the motions as if we are performing a full 64-bit multiply. With
19400 the exception that we need to do less shuffling of the elements. */
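      /* Concretely, writing the sign extensions as A = a + s1 * 2^32 and
	 B = b + s2 * 2^32, where s1/s2 are all-ones masks for negative
	 operands, we have modulo 2^64:
	   A * B == a * b + ((s1 * b + s2 * a) << 32)
	 which is what the PMULUDQ-based sequence below computes.  */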
19401
19402 /* Compute the sign-extension, aka highparts, of the two operands. */
19403 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19404 op1, pc_rtx, pc_rtx);
19405 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19406 op2, pc_rtx, pc_rtx);
19407
19408 /* Multiply LO(A) * HI(B), and vice-versa. */
19409 t1 = gen_reg_rtx (wmode);
19410 t2 = gen_reg_rtx (wmode);
19411 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19412 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19413
19414 /* Multiply LO(A) * LO(B). */
19415 t0 = gen_reg_rtx (wmode);
19416 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19417
19418 /* Combine and shift the highparts into place. */
19419 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19420 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19421 1, OPTAB_DIRECT);
19422
19423 /* Combine high and low parts. */
19424 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19425 return;
19426 }
19427 emit_insn (x);
19428 }
19429
19430 void
19431 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19432 bool uns_p, bool high_p)
19433 {
19434 machine_mode wmode = GET_MODE (dest);
19435 machine_mode mode = GET_MODE (op1);
19436 rtx t1, t2, t3, t4, mask;
19437
19438 switch (mode)
19439 {
19440 case E_V4SImode:
19441 t1 = gen_reg_rtx (mode);
19442 t2 = gen_reg_rtx (mode);
19443 if (TARGET_XOP && !uns_p)
19444 {
19445 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19446 shuffle the elements once so that all elements are in the right
19447 place for immediate use: { A C B D }. */
19448 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19449 const1_rtx, GEN_INT (3)));
19450 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19451 const1_rtx, GEN_INT (3)));
19452 }
19453 else
19454 {
19455 /* Put the elements into place for the multiply. */
19456 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19457 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19458 high_p = false;
19459 }
19460 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19461 break;
19462
19463 case E_V8SImode:
19464 /* Shuffle the elements between the lanes. After this we
19465 have { A B E F | C D G H } for each operand. */
19466 t1 = gen_reg_rtx (V4DImode);
19467 t2 = gen_reg_rtx (V4DImode);
19468 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19469 const0_rtx, const2_rtx,
19470 const1_rtx, GEN_INT (3)));
19471 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19472 const0_rtx, const2_rtx,
19473 const1_rtx, GEN_INT (3)));
19474
19475 /* Shuffle the elements within the lanes. After this we
19476 have { A A B B | C C D D } or { E E F F | G G H H }. */
19477 t3 = gen_reg_rtx (V8SImode);
19478 t4 = gen_reg_rtx (V8SImode);
19479 mask = GEN_INT (high_p
19480 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19481 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
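      /* The pshufd selector picks elements { 2, 2, 3, 3 } of each lane for
	 the high half and { 0, 0, 1, 1 } for the low half, producing the
	 duplicated pairs described above.  */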
19482 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19483 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19484
19485 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19486 break;
19487
19488 case E_V8HImode:
19489 case E_V16HImode:
19490 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19491 uns_p, OPTAB_DIRECT);
19492 t2 = expand_binop (mode,
19493 uns_p ? umul_highpart_optab : smul_highpart_optab,
19494 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19495 gcc_assert (t1 && t2);
19496
19497 t3 = gen_reg_rtx (mode);
19498 ix86_expand_vec_interleave (t3, t1, t2, high_p);
19499 emit_move_insn (dest, gen_lowpart (wmode, t3));
19500 break;
19501
19502 case E_V16QImode:
19503 case E_V32QImode:
19504 case E_V32HImode:
19505 case E_V16SImode:
19506 case E_V64QImode:
19507 t1 = gen_reg_rtx (wmode);
19508 t2 = gen_reg_rtx (wmode);
19509 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19510 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19511
19512 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19513 break;
19514
19515 default:
19516 gcc_unreachable ();
19517 }
19518 }
19519
19520 void
19521 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19522 {
19523 rtx res_1, res_2, res_3, res_4;
19524
19525 res_1 = gen_reg_rtx (V4SImode);
19526 res_2 = gen_reg_rtx (V4SImode);
19527 res_3 = gen_reg_rtx (V2DImode);
19528 res_4 = gen_reg_rtx (V2DImode);
19529 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19530 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
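  /* res_3 now holds the 64-bit products of the even elements
     { a0*b0, a2*b2 } and res_4 those of the odd elements { a1*b1, a3*b3 };
     only the low 32 bits of each product are needed below.  */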
19531
19532 /* Move the results in element 2 down to element 1; we don't care
19533 what goes in elements 2 and 3. Then we can merge the parts
19534 back together with an interleave.
19535
19536 Note that two other sequences were tried:
19537 (1) Use interleaves at the start instead of psrldq, which allows
19538 us to use a single shufps to merge things back at the end.
19539 (2) Use shufps here to combine the two vectors, then pshufd to
19540 put the elements in the correct order.
19541 In both cases the cost of the reformatting stall was too high
19542 and the overall sequence slower. */
19543
19544 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19545 const0_rtx, const2_rtx,
19546 const0_rtx, const0_rtx));
19547 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19548 const0_rtx, const2_rtx,
19549 const0_rtx, const0_rtx));
19550 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19551
19552 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19553 }
19554
19555 void
19556 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19557 {
19558 machine_mode mode = GET_MODE (op0);
19559 rtx t1, t2, t3, t4, t5, t6;
19560
19561 if (TARGET_AVX512DQ && mode == V8DImode)
19562 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19563 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19564 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19565 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19566 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19567 else if (TARGET_XOP && mode == V2DImode)
19568 {
19569 /* op1: A,B,C,D, op2: E,F,G,H */
19570 op1 = gen_lowpart (V4SImode, op1);
19571 op2 = gen_lowpart (V4SImode, op2);
19572
19573 t1 = gen_reg_rtx (V4SImode);
19574 t2 = gen_reg_rtx (V4SImode);
19575 t3 = gen_reg_rtx (V2DImode);
19576 t4 = gen_reg_rtx (V2DImode);
19577
19578 /* t1: B,A,D,C */
19579 emit_insn (gen_sse2_pshufd_1 (t1, op1,
19580 GEN_INT (1),
19581 GEN_INT (0),
19582 GEN_INT (3),
19583 GEN_INT (2)));
19584
19585 /* t2: (B*E),(A*F),(D*G),(C*H) */
19586 emit_insn (gen_mulv4si3 (t2, t1, op2));
19587
19588 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19589 emit_insn (gen_xop_phadddq (t3, t2));
19590
19591 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19592 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19593
19594 /* Multiply the low parts and add everything together. */
19595 t5 = gen_reg_rtx (V2DImode);
19596 emit_insn (gen_vec_widen_umult_even_v4si (t5,
19597 gen_lowpart (V4SImode, op1),
19598 gen_lowpart (V4SImode, op2)));
19599 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19600
19601 }
19602 else
19603 {
19604 machine_mode nmode;
19605 rtx (*umul) (rtx, rtx, rtx);
19606
19607 if (mode == V2DImode)
19608 {
19609 umul = gen_vec_widen_umult_even_v4si;
19610 nmode = V4SImode;
19611 }
19612 else if (mode == V4DImode)
19613 {
19614 umul = gen_vec_widen_umult_even_v8si;
19615 nmode = V8SImode;
19616 }
19617 else if (mode == V8DImode)
19618 {
19619 umul = gen_vec_widen_umult_even_v16si;
19620 nmode = V16SImode;
19621 }
19622 else
19623 gcc_unreachable ();
19624
19625
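      /* With op1 == h1 * 2^32 + l1 and op2 == h2 * 2^32 + l2 per element,
	 the product modulo 2^64 is l1 * l2 + ((h1 * l2 + h2 * l1) << 32),
	 built below from three widening unsigned 32x32 multiplies.  */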
19626 /* Multiply low parts. */
19627 t1 = gen_reg_rtx (mode);
19628 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19629
19630 /* Shift input vectors right 32 bits so we can multiply high parts. */
19631 t6 = GEN_INT (32);
19632 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19633 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19634
19635 /* Multiply high parts by low parts. */
19636 t4 = gen_reg_rtx (mode);
19637 t5 = gen_reg_rtx (mode);
19638 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19639 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19640
19641 /* Combine and shift the highparts back. */
19642 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19643 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19644
19645 /* Combine high and low parts. */
19646 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19647 }
19648
19649 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19650 gen_rtx_MULT (mode, op1, op2));
19651 }
19652
19653 /* Return true if the control transfer instruction INSN
19654 should be encoded with the notrack prefix. */
19655
19656 bool
19657 ix86_notrack_prefixed_insn_p (rtx insn)
19658 {
19659 if (!insn || !((flag_cf_protection & CF_BRANCH)))
19660 return false;
19661
19662 if (CALL_P (insn))
19663 {
19664 rtx call = get_call_rtx_from (insn);
19665 gcc_assert (call != NULL_RTX);
19666 rtx addr = XEXP (call, 0);
19667
19668 /* Do not emit 'notrack' if it's not an indirect call. */
19669 if (MEM_P (addr)
19670 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19671 return false;
19672 else
19673 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19674 }
19675
19676 if (JUMP_P (insn) && !flag_cet_switch)
19677 {
19678 rtx target = JUMP_LABEL (insn);
19679 if (target == NULL_RTX || ANY_RETURN_P (target))
19680 return false;
19681
19682 /* Check whether the jump uses a switch jump table. */
19683 rtx_insn *label = as_a<rtx_insn *> (target);
19684 rtx_insn *table = next_insn (label);
19685 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
19686 return false;
19687 else
19688 return true;
19689 }
19690 return false;
19691 }
19692
19693 /* Calculate integer abs() using only SSE2 instructions. */
19694
19695 void
19696 ix86_expand_sse2_abs (rtx target, rtx input)
19697 {
19698 machine_mode mode = GET_MODE (target);
19699 rtx tmp0, tmp1, x;
19700
19701 switch (mode)
19702 {
19703 case E_V2DImode:
19704 case E_V4DImode:
19705 /* For 64-bit signed integer X, with SSE4.2 use
19706 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
19707 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
19708 32, and emulate the (unimplemented) arithmetic right shift with a
19709 logical right shift followed by a negation. */
19710 if (TARGET_SSE4_2)
19711 {
19712 tmp0 = gen_reg_rtx (mode);
19713 tmp1 = gen_reg_rtx (mode);
19714 emit_move_insn (tmp1, CONST0_RTX (mode));
19715 if (mode == E_V2DImode)
19716 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
19717 else
19718 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
19719 }
19720 else
19721 {
19722 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
19723 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
19724 - 1), NULL, 0, OPTAB_DIRECT);
19725 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
19726 }
19727
19728 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
19729 NULL, 0, OPTAB_DIRECT);
19730 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
19731 target, 0, OPTAB_DIRECT);
19732 break;
19733
19734 case E_V4SImode:
19735 /* For 32-bit signed integer X, the best way to calculate the absolute
19736 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
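      /* E.g. for X == -5: X >> 31 == -1, (-1 ^ -5) == 4 and
	 4 - (-1) == 5.  */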
19737 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
19738 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
19739 NULL, 0, OPTAB_DIRECT);
19740 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
19741 NULL, 0, OPTAB_DIRECT);
19742 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
19743 target, 0, OPTAB_DIRECT);
19744 break;
19745
19746 case E_V8HImode:
19747 /* For 16-bit signed integer X, the best way to calculate the absolute
19748 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
19749 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
19750
19751 x = expand_simple_binop (mode, SMAX, tmp0, input,
19752 target, 0, OPTAB_DIRECT);
19753 break;
19754
19755 case E_V16QImode:
19756 /* For 8-bit signed integer X, the best way to calculate the absolute
19757 value of X is min ((unsigned char) X, (unsigned char) (-X)),
19758 as SSE2 provides the PMINUB insn. */
19759 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
19760
19761 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
19762 target, 0, OPTAB_DIRECT);
19763 break;
19764
19765 default:
19766 gcc_unreachable ();
19767 }
19768
19769 if (x != target)
19770 emit_move_insn (target, x);
19771 }
19772
19773 /* Expand an extract from a vector register through pextr insn.
19774 Return true if successful. */
19775
19776 bool
19777 ix86_expand_pextr (rtx *operands)
19778 {
19779 rtx dst = operands[0];
19780 rtx src = operands[1];
19781
19782 unsigned int size = INTVAL (operands[2]);
19783 unsigned int pos = INTVAL (operands[3]);
19784
19785 if (SUBREG_P (dst))
19786 {
19787 /* Reject non-lowpart subregs. */
19788 if (SUBREG_BYTE (dst) > 0)
19789 return false;
19790 dst = SUBREG_REG (dst);
19791 }
19792
19793 if (SUBREG_P (src))
19794 {
19795 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
19796 src = SUBREG_REG (src);
19797 }
19798
19799 switch (GET_MODE (src))
19800 {
19801 case E_V16QImode:
19802 case E_V8HImode:
19803 case E_V4SImode:
19804 case E_V2DImode:
19805 case E_V1TImode:
19806 case E_TImode:
19807 {
19808 machine_mode srcmode, dstmode;
19809 rtx d, pat;
19810
19811 if (!int_mode_for_size (size, 0).exists (&dstmode))
19812 return false;
19813
19814 switch (dstmode)
19815 {
19816 case E_QImode:
19817 if (!TARGET_SSE4_1)
19818 return false;
19819 srcmode = V16QImode;
19820 break;
19821
19822 case E_HImode:
19823 if (!TARGET_SSE2)
19824 return false;
19825 srcmode = V8HImode;
19826 break;
19827
19828 case E_SImode:
19829 if (!TARGET_SSE4_1)
19830 return false;
19831 srcmode = V4SImode;
19832 break;
19833
19834 case E_DImode:
19835 gcc_assert (TARGET_64BIT);
19836 if (!TARGET_SSE4_1)
19837 return false;
19838 srcmode = V2DImode;
19839 break;
19840
19841 default:
19842 return false;
19843 }
19844
19845 /* Reject extractions from misaligned positions. */
19846 if (pos & (size - 1))
19847 return false;
19848
19849 if (GET_MODE (dst) == dstmode)
19850 d = dst;
19851 else
19852 d = gen_reg_rtx (dstmode);
19853
19854 /* Construct insn pattern. */
19855 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
19856 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
19857
19858 /* Let the rtl optimizers know about the zero extension performed. */
19859 if (dstmode == QImode || dstmode == HImode)
19860 {
19861 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
19862 d = gen_lowpart (SImode, d);
19863 }
19864
19865 emit_insn (gen_rtx_SET (d, pat));
19866
19867 if (d != dst)
19868 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
19869 return true;
19870 }
19871
19872 default:
19873 return false;
19874 }
19875 }
19876
19877 /* Expand an insert into a vector register through pinsr insn.
19878 Return true if successful. */
19879
19880 bool
19881 ix86_expand_pinsr (rtx *operands)
19882 {
19883 rtx dst = operands[0];
19884 rtx src = operands[3];
19885
19886 unsigned int size = INTVAL (operands[1]);
19887 unsigned int pos = INTVAL (operands[2]);
19888
19889 if (SUBREG_P (dst))
19890 {
19891 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
19892 dst = SUBREG_REG (dst);
19893 }
19894
19895 switch (GET_MODE (dst))
19896 {
19897 case E_V16QImode:
19898 case E_V8HImode:
19899 case E_V4SImode:
19900 case E_V2DImode:
19901 case E_V1TImode:
19902 case E_TImode:
19903 {
19904 machine_mode srcmode, dstmode;
19905 rtx (*pinsr)(rtx, rtx, rtx, rtx);
19906 rtx d;
19907
19908 if (!int_mode_for_size (size, 0).exists (&srcmode))
19909 return false;
19910
19911 switch (srcmode)
19912 {
19913 case E_QImode:
19914 if (!TARGET_SSE4_1)
19915 return false;
19916 dstmode = V16QImode;
19917 pinsr = gen_sse4_1_pinsrb;
19918 break;
19919
19920 case E_HImode:
19921 if (!TARGET_SSE2)
19922 return false;
19923 dstmode = V8HImode;
19924 pinsr = gen_sse2_pinsrw;
19925 break;
19926
19927 case E_SImode:
19928 if (!TARGET_SSE4_1)
19929 return false;
19930 dstmode = V4SImode;
19931 pinsr = gen_sse4_1_pinsrd;
19932 break;
19933
19934 case E_DImode:
19935 gcc_assert (TARGET_64BIT);
19936 if (!TARGET_SSE4_1)
19937 return false;
19938 dstmode = V2DImode;
19939 pinsr = gen_sse4_1_pinsrq;
19940 break;
19941
19942 default:
19943 return false;
19944 }
19945
19946 /* Reject insertions to misaligned positions. */
19947 if (pos & (size - 1))
19948 return false;
19949
19950 if (SUBREG_P (src))
19951 {
19952 unsigned int srcpos = SUBREG_BYTE (src);
19953
19954 if (srcpos > 0)
19955 {
19956 rtx extr_ops[4];
19957
19958 extr_ops[0] = gen_reg_rtx (srcmode);
19959 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
19960 extr_ops[2] = GEN_INT (size);
19961 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
19962
19963 if (!ix86_expand_pextr (extr_ops))
19964 return false;
19965
19966 src = extr_ops[0];
19967 }
19968 else
19969 src = gen_lowpart (srcmode, SUBREG_REG (src));
19970 }
19971
19972 if (GET_MODE (dst) == dstmode)
19973 d = dst;
19974 else
19975 d = gen_reg_rtx (dstmode);
19976
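	/* The last operand of the pinsr expanders is a single-bit mask that
	   selects the element being replaced, hence 1 << (pos / size).  */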
19977 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
19978 gen_lowpart (srcmode, src),
19979 GEN_INT (1 << (pos / size))));
19980 if (d != dst)
19981 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
19982 return true;
19983 }
19984
19985 default:
19986 return false;
19987 }
19988 }
19989
19990 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
19991 of the upper half against the lower half down to SSE register size. */
19992
19993 machine_mode
19994 ix86_split_reduction (machine_mode mode)
19995 {
19996 /* Reduce lowpart against highpart until we reach SSE reg width to
19997 avoid cross-lane operations. */
19998 switch (mode)
19999 {
20000 case E_V8DImode:
20001 case E_V4DImode:
20002 return V2DImode;
20003 case E_V16SImode:
20004 case E_V8SImode:
20005 return V4SImode;
20006 case E_V32HImode:
20007 case E_V16HImode:
20008 return V8HImode;
20009 case E_V64QImode:
20010 case E_V32QImode:
20011 return V16QImode;
20012 case E_V16SFmode:
20013 case E_V8SFmode:
20014 return V4SFmode;
20015 case E_V8DFmode:
20016 case E_V4DFmode:
20017 return V2DFmode;
20018 default:
20019 return mode;
20020 }
20021 }
20022
20023 /* Generate a call to __divmoddi4. */
20024
20025 void
20026 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20027 rtx op0, rtx op1,
20028 rtx *quot_p, rtx *rem_p)
20029 {
20030 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20031
20032 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20033 mode, op0, mode, op1, mode,
20034 XEXP (rem, 0), Pmode);
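  /* The libcall returns the quotient in the normal return value and stores
     the remainder through the pointer passed as its last argument, i.e.
     into the stack temporary allocated above.  */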
20035 *quot_p = quot;
20036 *rem_p = rem;
20037 }
20038
20039 #include "gt-i386-expand.h"