x86: Check TARGET_AVX512VL when enabling FMA
[gcc.git] / gcc / config / i386 / i386-expand.c
1 /* Copyright (C) 1988-2020 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
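/* As a rough illustration: splitting a single DImode memory reference
   yields two SImode references at byte offsets 0 and 4, so
   operands[0] = (mem:DI addr) gives lo_half[0] = (mem:SI addr) and
   hi_half[0] = (mem:SI (addr + 4)).  */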
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 default:
120 gcc_unreachable ();
121 }
122
123 byte = GET_MODE_SIZE (half_mode);
124
125 while (num--)
126 {
127 rtx op = operands[num];
128
129 /* simplify_subreg refuses to split volatile memory addresses,
130 but we still have to handle it. */
131 if (MEM_P (op))
132 {
133 if (mem_op && rtx_equal_p (op, mem_op))
134 {
135 lo_half[num] = lo_half[mem_num];
136 hi_half[num] = hi_half[mem_num];
137 }
138 else
139 {
140 mem_op = op;
141 mem_num = num;
142 lo_half[num] = adjust_address (op, half_mode, 0);
143 hi_half[num] = adjust_address (op, half_mode, byte);
144 }
145 }
146 else
147 {
148 lo_half[num] = simplify_gen_subreg (half_mode, op,
149 GET_MODE (op) == VOIDmode
150 ? mode : GET_MODE (op), 0);
151 hi_half[num] = simplify_gen_subreg (half_mode, op,
152 GET_MODE (op) == VOIDmode
153 ? mode : GET_MODE (op), byte);
154 }
155 }
156 }
157
158 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
159 for the target. */
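/* As a rough illustration: clearing %eax is normally emitted as
   "xor %eax, %eax" (shorter, but it clobbers the flags, hence the
   CLOBBER added below), while "mov $0, %eax" is used only when
   TARGET_USE_MOV0 is set and we are not optimizing for size.  */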
160
161 void
162 ix86_expand_clear (rtx dest)
163 {
164 rtx tmp;
165
166 /* We play register width games, which are only valid after reload. */
167 gcc_assert (reload_completed);
168
169 /* Avoid HImode and its attendant prefix byte. */
170 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
171 dest = gen_rtx_REG (SImode, REGNO (dest));
172 tmp = gen_rtx_SET (dest, const0_rtx);
173
174 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
175 {
176 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
177 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
178 }
179
180 emit_insn (tmp);
181 }
182
183 void
184 ix86_expand_move (machine_mode mode, rtx operands[])
185 {
186 rtx op0, op1;
187 rtx tmp, addend = NULL_RTX;
188 enum tls_model model;
189
190 op0 = operands[0];
191 op1 = operands[1];
192
193 switch (GET_CODE (op1))
194 {
195 case CONST:
196 tmp = XEXP (op1, 0);
197
198 if (GET_CODE (tmp) != PLUS
199 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
200 break;
201
202 op1 = XEXP (tmp, 0);
203 addend = XEXP (tmp, 1);
204 /* FALLTHRU */
205
206 case SYMBOL_REF:
207 model = SYMBOL_REF_TLS_MODEL (op1);
208
209 if (model)
210 op1 = legitimize_tls_address (op1, model, true);
211 else if (ix86_force_load_from_GOT_p (op1))
212 {
213 /* Load the external function address via GOT slot to avoid PLT. */
214 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
215 (TARGET_64BIT
216 ? UNSPEC_GOTPCREL
217 : UNSPEC_GOT));
218 op1 = gen_rtx_CONST (Pmode, op1);
219 op1 = gen_const_mem (Pmode, op1);
220 set_mem_alias_set (op1, ix86_GOT_alias_set ());
221 }
222 else
223 {
224 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
225 if (tmp)
226 {
227 op1 = tmp;
228 if (!addend)
229 break;
230 }
231 else
232 {
233 op1 = operands[1];
234 break;
235 }
236 }
237
238 if (addend)
239 {
240 op1 = force_operand (op1, NULL_RTX);
241 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
242 op0, 1, OPTAB_DIRECT);
243 }
244 else
245 op1 = force_operand (op1, op0);
246
247 if (op1 == op0)
248 return;
249
250 op1 = convert_to_mode (mode, op1, 1);
251
252 default:
253 break;
254 }
255
256 if ((flag_pic || MACHOPIC_INDIRECT)
257 && symbolic_operand (op1, mode))
258 {
259 if (TARGET_MACHO && !TARGET_64BIT)
260 {
261 #if TARGET_MACHO
262 /* dynamic-no-pic */
263 if (MACHOPIC_INDIRECT)
264 {
265 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
266 ? op0 : gen_reg_rtx (Pmode);
267 op1 = machopic_indirect_data_reference (op1, temp);
268 if (MACHOPIC_PURE)
269 op1 = machopic_legitimize_pic_address (op1, mode,
270 temp == op1 ? 0 : temp);
271 }
272 if (op0 != op1 && GET_CODE (op0) != MEM)
273 {
274 rtx insn = gen_rtx_SET (op0, op1);
275 emit_insn (insn);
276 return;
277 }
278 if (GET_CODE (op0) == MEM)
279 op1 = force_reg (Pmode, op1);
280 else
281 {
282 rtx temp = op0;
283 if (GET_CODE (temp) != REG)
284 temp = gen_reg_rtx (Pmode);
285 temp = legitimize_pic_address (op1, temp);
286 if (temp == op0)
287 return;
288 op1 = temp;
289 }
290 /* dynamic-no-pic */
291 #endif
292 }
293 else
294 {
295 if (MEM_P (op0))
296 op1 = force_reg (mode, op1);
297 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
298 {
299 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
300 op1 = legitimize_pic_address (op1, reg);
301 if (op0 == op1)
302 return;
303 op1 = convert_to_mode (mode, op1, 1);
304 }
305 }
306 }
307 else
308 {
309 if (MEM_P (op0)
310 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
311 || !push_operand (op0, mode))
312 && MEM_P (op1))
313 op1 = force_reg (mode, op1);
314
315 if (push_operand (op0, mode)
316 && ! general_no_elim_operand (op1, mode))
317 op1 = copy_to_mode_reg (mode, op1);
318
319 /* Force large constants in 64-bit compilation into a register
320 so that they can be CSEd. */
321 if (can_create_pseudo_p ()
322 && (mode == DImode) && TARGET_64BIT
323 && immediate_operand (op1, mode)
324 && !x86_64_zext_immediate_operand (op1, VOIDmode)
325 && !register_operand (op0, mode)
326 && optimize)
327 op1 = copy_to_mode_reg (mode, op1);
328
329 if (can_create_pseudo_p ()
330 && CONST_DOUBLE_P (op1))
331 {
332 /* If we are loading a floating point constant to a register,
333 force the value to memory now, since we'll get better code
334 out of the back end. */
335
336 op1 = validize_mem (force_const_mem (mode, op1));
337 if (!register_operand (op0, mode))
338 {
339 rtx temp = gen_reg_rtx (mode);
340 emit_insn (gen_rtx_SET (temp, op1));
341 emit_move_insn (op0, temp);
342 return;
343 }
344 }
345 }
346
347 emit_insn (gen_rtx_SET (op0, op1));
348 }
349
350 void
351 ix86_expand_vector_move (machine_mode mode, rtx operands[])
352 {
353 rtx op0 = operands[0], op1 = operands[1];
354 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
355 psABI, since the biggest alignment there is 4 bytes. */
356 unsigned int align = (TARGET_IAMCU
357 ? GET_MODE_BITSIZE (mode)
358 : GET_MODE_ALIGNMENT (mode));
359
360 if (push_operand (op0, VOIDmode))
361 op0 = emit_move_resolve_push (mode, op0);
362
363 /* Force constants other than zero into memory. We do not know how
364 the instructions used to build constants modify the upper 64 bits
365 of the register; once we have that information we may be able
366 to handle some of them more efficiently. */
367 if (can_create_pseudo_p ()
368 && (CONSTANT_P (op1)
369 || (SUBREG_P (op1)
370 && CONSTANT_P (SUBREG_REG (op1))))
371 && ((register_operand (op0, mode)
372 && !standard_sse_constant_p (op1, mode))
373 /* ix86_expand_vector_move_misalign() does not like constants. */
374 || (SSE_REG_MODE_P (mode)
375 && MEM_P (op0)
376 && MEM_ALIGN (op0) < align)))
377 {
378 if (SUBREG_P (op1))
379 {
380 machine_mode imode = GET_MODE (SUBREG_REG (op1));
381 rtx r = force_const_mem (imode, SUBREG_REG (op1));
382 if (r)
383 r = validize_mem (r);
384 else
385 r = force_reg (imode, SUBREG_REG (op1));
386 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
387 }
388 else
389 op1 = validize_mem (force_const_mem (mode, op1));
390 }
391
392 /* We need to check memory alignment for SSE modes since an attribute
393 can make operands unaligned. */
394 if (can_create_pseudo_p ()
395 && SSE_REG_MODE_P (mode)
396 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
397 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
398 {
399 rtx tmp[2];
400
401 /* ix86_expand_vector_move_misalign() does not like both
402 arguments in memory. */
403 if (!register_operand (op0, mode)
404 && !register_operand (op1, mode))
405 op1 = force_reg (mode, op1);
406
407 tmp[0] = op0; tmp[1] = op1;
408 ix86_expand_vector_move_misalign (mode, tmp);
409 return;
410 }
411
412 /* Make operand1 a register if it isn't already. */
413 if (can_create_pseudo_p ()
414 && !register_operand (op0, mode)
415 && !register_operand (op1, mode))
416 {
417 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
418 return;
419 }
420
421 emit_insn (gen_rtx_SET (op0, op1));
422 }
423
424 /* Split 32-byte AVX unaligned load and store if needed. */
425
426 static void
427 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
428 {
429 rtx m;
430 rtx (*extract) (rtx, rtx, rtx);
431 machine_mode mode;
432
433 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
434 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
435 {
436 emit_insn (gen_rtx_SET (op0, op1));
437 return;
438 }
439
440 rtx orig_op0 = NULL_RTX;
441 mode = GET_MODE (op0);
442 switch (GET_MODE_CLASS (mode))
443 {
444 case MODE_VECTOR_INT:
445 case MODE_INT:
446 if (mode != V32QImode)
447 {
448 if (!MEM_P (op0))
449 {
450 orig_op0 = op0;
451 op0 = gen_reg_rtx (V32QImode);
452 }
453 else
454 op0 = gen_lowpart (V32QImode, op0);
455 op1 = gen_lowpart (V32QImode, op1);
456 mode = V32QImode;
457 }
458 break;
459 case MODE_VECTOR_FLOAT:
460 break;
461 default:
462 gcc_unreachable ();
463 }
464
465 switch (mode)
466 {
467 default:
468 gcc_unreachable ();
469 case E_V32QImode:
470 extract = gen_avx_vextractf128v32qi;
471 mode = V16QImode;
472 break;
473 case E_V8SFmode:
474 extract = gen_avx_vextractf128v8sf;
475 mode = V4SFmode;
476 break;
477 case E_V4DFmode:
478 extract = gen_avx_vextractf128v4df;
479 mode = V2DFmode;
480 break;
481 }
482
483 if (MEM_P (op1))
484 {
485 rtx r = gen_reg_rtx (mode);
486 m = adjust_address (op1, mode, 0);
487 emit_move_insn (r, m);
488 m = adjust_address (op1, mode, 16);
489 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
490 emit_move_insn (op0, r);
491 }
492 else if (MEM_P (op0))
493 {
494 m = adjust_address (op0, mode, 0);
495 emit_insn (extract (m, op1, const0_rtx));
496 m = adjust_address (op0, mode, 16);
497 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
498 }
499 else
500 gcc_unreachable ();
501
502 if (orig_op0)
503 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
504 }
505
506 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
507 straight to ix86_expand_vector_move. */
508 /* Code generation for scalar reg-reg moves of single and double precision data:
509 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
510 movaps reg, reg
511 else
512 movss reg, reg
513 if (x86_sse_partial_reg_dependency == true)
514 movapd reg, reg
515 else
516 movsd reg, reg
517
518 Code generation for scalar loads of double precision data:
519 if (x86_sse_split_regs == true)
520 movlpd mem, reg (gas syntax)
521 else
522 movsd mem, reg
523
524 Code generation for unaligned packed loads of single precision data
525 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
526 if (x86_sse_unaligned_move_optimal)
527 movups mem, reg
528
529 if (x86_sse_partial_reg_dependency == true)
530 {
531 xorps reg, reg
532 movlps mem, reg
533 movhps mem+8, reg
534 }
535 else
536 {
537 movlps mem, reg
538 movhps mem+8, reg
539 }
540
541 Code generation for unaligned packed loads of double precision data
542 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
543 if (x86_sse_unaligned_move_optimal)
544 movupd mem, reg
545
546 if (x86_sse_split_regs == true)
547 {
548 movlpd mem, reg
549 movhpd mem+8, reg
550 }
551 else
552 {
553 movsd mem, reg
554 movhpd mem+8, reg
555 }
556 */
557
558 void
559 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
560 {
561 rtx op0, op1, m;
562
563 op0 = operands[0];
564 op1 = operands[1];
565
566 /* Use unaligned load/store for AVX512 or when optimizing for size. */
567 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
568 {
569 emit_insn (gen_rtx_SET (op0, op1));
570 return;
571 }
572
573 if (TARGET_AVX)
574 {
575 if (GET_MODE_SIZE (mode) == 32)
576 ix86_avx256_split_vector_move_misalign (op0, op1);
577 else
578 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
579 emit_insn (gen_rtx_SET (op0, op1));
580 return;
581 }
582
583 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
584 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
585 {
586 emit_insn (gen_rtx_SET (op0, op1));
587 return;
588 }
589
590 /* ??? If we have typed data, then it would appear that using
591 movdqu is the only way to get unaligned data loaded with
592 integer type. */
593 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
594 {
595 emit_insn (gen_rtx_SET (op0, op1));
596 return;
597 }
598
599 if (MEM_P (op1))
600 {
601 if (TARGET_SSE2 && mode == V2DFmode)
602 {
603 rtx zero;
604
605 /* When SSE registers are split into halves, we can avoid
606 writing to the top half twice. */
607 if (TARGET_SSE_SPLIT_REGS)
608 {
609 emit_clobber (op0);
610 zero = op0;
611 }
612 else
613 {
614 /* ??? Not sure about the best option for the Intel chips.
615 The following would seem to satisfy; the register is
616 entirely cleared, breaking the dependency chain. We
617 then store to the upper half, with a dependency depth
618 of one. A rumor has it that Intel recommends two movsd
619 followed by an unpcklpd, but this is unconfirmed. And
620 given that the dependency depth of the unpcklpd would
621 still be one, I'm not sure why this would be better. */
622 zero = CONST0_RTX (V2DFmode);
623 }
624
625 m = adjust_address (op1, DFmode, 0);
626 emit_insn (gen_sse2_loadlpd (op0, zero, m));
627 m = adjust_address (op1, DFmode, 8);
628 emit_insn (gen_sse2_loadhpd (op0, op0, m));
629 }
630 else
631 {
632 rtx t;
633
634 if (mode != V4SFmode)
635 t = gen_reg_rtx (V4SFmode);
636 else
637 t = op0;
638
639 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
640 emit_move_insn (t, CONST0_RTX (V4SFmode));
641 else
642 emit_clobber (t);
643
644 m = adjust_address (op1, V2SFmode, 0);
645 emit_insn (gen_sse_loadlps (t, t, m));
646 m = adjust_address (op1, V2SFmode, 8);
647 emit_insn (gen_sse_loadhps (t, t, m));
648 if (mode != V4SFmode)
649 emit_move_insn (op0, gen_lowpart (mode, t));
650 }
651 }
652 else if (MEM_P (op0))
653 {
654 if (TARGET_SSE2 && mode == V2DFmode)
655 {
656 m = adjust_address (op0, DFmode, 0);
657 emit_insn (gen_sse2_storelpd (m, op1));
658 m = adjust_address (op0, DFmode, 8);
659 emit_insn (gen_sse2_storehpd (m, op1));
660 }
661 else
662 {
663 if (mode != V4SFmode)
664 op1 = gen_lowpart (V4SFmode, op1);
665
666 m = adjust_address (op0, V2SFmode, 0);
667 emit_insn (gen_sse_storelps (m, op1));
668 m = adjust_address (op0, V2SFmode, 8);
669 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
670 }
671 }
672 else
673 gcc_unreachable ();
674 }
675
676 /* Move bits 64:95 to bits 32:63. */
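/* As a rough illustration: on the V4SImode view of OP this emits a
   pshufd-style shuffle with selector {0, 2, 0, 0}, so element 2
   (bits 64:95) is copied into element 1 (bits 32:63).  */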
677
678 void
679 ix86_move_vector_high_sse_to_mmx (rtx op)
680 {
681 rtx mask = gen_rtx_PARALLEL (VOIDmode,
682 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
683 GEN_INT (0), GEN_INT (0)));
684 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
685 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
686 rtx insn = gen_rtx_SET (dest, op);
687 emit_insn (insn);
688 }
689
690 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
691
692 void
693 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
694 {
695 rtx op0 = operands[0];
696 rtx op1 = operands[1];
697 rtx op2 = operands[2];
698
699 machine_mode dmode = GET_MODE (op0);
700 machine_mode smode = GET_MODE (op1);
701 machine_mode inner_dmode = GET_MODE_INNER (dmode);
702 machine_mode inner_smode = GET_MODE_INNER (smode);
703
704 /* Get the corresponding SSE mode for destination. */
705 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
706 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
707 nunits).require ();
708 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
709 nunits / 2).require ();
710
711 /* Get the corresponding SSE mode for source. */
712 nunits = 16 / GET_MODE_SIZE (inner_smode);
713 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
714 nunits).require ();
715
716 /* Generate SSE pack with signed/unsigned saturation. */
717 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
718 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
719 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
720
721 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
722 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
723 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
724 op1, op2));
725 emit_insn (insn);
726
727 ix86_move_vector_high_sse_to_mmx (op0);
728 }
729
730 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
731
732 void
733 ix86_split_mmx_punpck (rtx operands[], bool high_p)
734 {
735 rtx op0 = operands[0];
736 rtx op1 = operands[1];
737 rtx op2 = operands[2];
738 machine_mode mode = GET_MODE (op0);
739 rtx mask;
740 /* The corresponding SSE mode. */
741 machine_mode sse_mode, double_sse_mode;
742
743 switch (mode)
744 {
745 case E_V8QImode:
746 sse_mode = V16QImode;
747 double_sse_mode = V32QImode;
748 mask = gen_rtx_PARALLEL (VOIDmode,
749 gen_rtvec (16,
750 GEN_INT (0), GEN_INT (16),
751 GEN_INT (1), GEN_INT (17),
752 GEN_INT (2), GEN_INT (18),
753 GEN_INT (3), GEN_INT (19),
754 GEN_INT (4), GEN_INT (20),
755 GEN_INT (5), GEN_INT (21),
756 GEN_INT (6), GEN_INT (22),
757 GEN_INT (7), GEN_INT (23)));
758 break;
759
760 case E_V4HImode:
761 sse_mode = V8HImode;
762 double_sse_mode = V16HImode;
763 mask = gen_rtx_PARALLEL (VOIDmode,
764 gen_rtvec (8,
765 GEN_INT (0), GEN_INT (8),
766 GEN_INT (1), GEN_INT (9),
767 GEN_INT (2), GEN_INT (10),
768 GEN_INT (3), GEN_INT (11)));
769 break;
770
771 case E_V2SImode:
772 sse_mode = V4SImode;
773 double_sse_mode = V8SImode;
774 mask = gen_rtx_PARALLEL (VOIDmode,
775 gen_rtvec (4,
776 GEN_INT (0), GEN_INT (4),
777 GEN_INT (1), GEN_INT (5)));
778 break;
779
780 default:
781 gcc_unreachable ();
782 }
783
784 /* Generate SSE punpcklXX. */
785 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
786 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
787 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
788
789 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
790 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
791 rtx insn = gen_rtx_SET (dest, op2);
792 emit_insn (insn);
793
794 if (high_p)
795 {
796 /* Move bits 64:127 to bits 0:63. */
797 mask = gen_rtx_PARALLEL (VOIDmode,
798 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
799 GEN_INT (0), GEN_INT (0)));
800 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
801 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
802 insn = gen_rtx_SET (dest, op1);
803 emit_insn (insn);
804 }
805 }
806
807 /* Helper function of ix86_fixup_binary_operands to canonicalize
808 operand order. Returns true if the operands should be swapped. */
809
810 static bool
811 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
812 rtx operands[])
813 {
814 rtx dst = operands[0];
815 rtx src1 = operands[1];
816 rtx src2 = operands[2];
817
818 /* If the operation is not commutative, we can't do anything. */
819 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
820 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
821 return false;
822
823 /* Highest priority is that src1 should match dst. */
824 if (rtx_equal_p (dst, src1))
825 return false;
826 if (rtx_equal_p (dst, src2))
827 return true;
828
829 /* Next highest priority is that immediate constants come second. */
830 if (immediate_operand (src2, mode))
831 return false;
832 if (immediate_operand (src1, mode))
833 return true;
834
835 /* Lowest priority is that memory references should come second. */
836 if (MEM_P (src2))
837 return false;
838 if (MEM_P (src1))
839 return true;
840
841 return false;
842 }
843
844
845 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
846 destination to use for the operation. If different from the true
847 destination in operands[0], a copy operation will be required. */
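/* As a rough sketch of the effect: for an addition of the form
   mem1 = mem2 + mem3 (not encodable as a single x86 insn), src1 is
   forced into a register, and since the memory destination then has no
   matching source a fresh register is returned as the destination; the
   caller stores it back to operands[0] afterwards.  */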
848
849 rtx
850 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
851 rtx operands[])
852 {
853 rtx dst = operands[0];
854 rtx src1 = operands[1];
855 rtx src2 = operands[2];
856
857 /* Canonicalize operand order. */
858 if (ix86_swap_binary_operands_p (code, mode, operands))
859 {
860 /* It is invalid to swap operands of different modes. */
861 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
862
863 std::swap (src1, src2);
864 }
865
866 /* Both source operands cannot be in memory. */
867 if (MEM_P (src1) && MEM_P (src2))
868 {
869 /* Optimization: Only read from memory once. */
870 if (rtx_equal_p (src1, src2))
871 {
872 src2 = force_reg (mode, src2);
873 src1 = src2;
874 }
875 else if (rtx_equal_p (dst, src1))
876 src2 = force_reg (mode, src2);
877 else
878 src1 = force_reg (mode, src1);
879 }
880
881 /* If the destination is memory, and we do not have matching source
882 operands, do things in registers. */
883 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
884 dst = gen_reg_rtx (mode);
885
886 /* Source 1 cannot be a constant. */
887 if (CONSTANT_P (src1))
888 src1 = force_reg (mode, src1);
889
890 /* Source 1 cannot be a non-matching memory. */
891 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
892 src1 = force_reg (mode, src1);
893
894 /* Improve address combine. */
895 if (code == PLUS
896 && GET_MODE_CLASS (mode) == MODE_INT
897 && MEM_P (src2))
898 src2 = force_reg (mode, src2);
899
900 operands[1] = src1;
901 operands[2] = src2;
902 return dst;
903 }
904
905 /* Similarly, but assume that the destination has already been
906 set up properly. */
907
908 void
909 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
910 machine_mode mode, rtx operands[])
911 {
912 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
913 gcc_assert (dst == operands[0]);
914 }
915
916 /* Attempt to expand a binary operator. Make the expansion closer to the
917 actual machine than just general_operand, which would allow 3 separate
918 memory references (one output, two inputs) in a single insn. */
919
920 void
921 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
922 rtx operands[])
923 {
924 rtx src1, src2, dst, op, clob;
925
926 dst = ix86_fixup_binary_operands (code, mode, operands);
927 src1 = operands[1];
928 src2 = operands[2];
929
930 /* Emit the instruction. */
931
932 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
933
934 if (reload_completed
935 && code == PLUS
936 && !rtx_equal_p (dst, src1))
937 {
938 /* This is going to be an LEA; avoid splitting it later. */
939 emit_insn (op);
940 }
941 else
942 {
943 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
944 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
945 }
946
947 /* Fix up the destination if needed. */
948 if (dst != operands[0])
949 emit_move_insn (operands[0], dst);
950 }
951
952 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
953 the given OPERANDS. */
954
955 void
956 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
957 rtx operands[])
958 {
959 rtx op1 = NULL_RTX, op2 = NULL_RTX;
960 if (SUBREG_P (operands[1]))
961 {
962 op1 = operands[1];
963 op2 = operands[2];
964 }
965 else if (SUBREG_P (operands[2]))
966 {
967 op1 = operands[2];
968 op2 = operands[1];
969 }
970 /* Optimize (__m128i) d | (__m128i) e and similar code
971 when d and e are float vectors into float vector logical
972 insn. In C/C++ without using intrinsics there is no other way
973 to express vector logical operation on float vectors than
974 to cast them temporarily to integer vectors. */
975 if (op1
976 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
977 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
978 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
979 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
980 && SUBREG_BYTE (op1) == 0
981 && (GET_CODE (op2) == CONST_VECTOR
982 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
983 && SUBREG_BYTE (op2) == 0))
984 && can_create_pseudo_p ())
985 {
986 rtx dst;
987 switch (GET_MODE (SUBREG_REG (op1)))
988 {
989 case E_V4SFmode:
990 case E_V8SFmode:
991 case E_V16SFmode:
992 case E_V2DFmode:
993 case E_V4DFmode:
994 case E_V8DFmode:
995 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
996 if (GET_CODE (op2) == CONST_VECTOR)
997 {
998 op2 = gen_lowpart (GET_MODE (dst), op2);
999 op2 = force_reg (GET_MODE (dst), op2);
1000 }
1001 else
1002 {
1003 op1 = operands[1];
1004 op2 = SUBREG_REG (operands[2]);
1005 if (!vector_operand (op2, GET_MODE (dst)))
1006 op2 = force_reg (GET_MODE (dst), op2);
1007 }
1008 op1 = SUBREG_REG (op1);
1009 if (!vector_operand (op1, GET_MODE (dst)))
1010 op1 = force_reg (GET_MODE (dst), op1);
1011 emit_insn (gen_rtx_SET (dst,
1012 gen_rtx_fmt_ee (code, GET_MODE (dst),
1013 op1, op2)));
1014 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1015 return;
1016 default:
1017 break;
1018 }
1019 }
1020 if (!vector_operand (operands[1], mode))
1021 operands[1] = force_reg (mode, operands[1]);
1022 if (!vector_operand (operands[2], mode))
1023 operands[2] = force_reg (mode, operands[2]);
1024 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1025 emit_insn (gen_rtx_SET (operands[0],
1026 gen_rtx_fmt_ee (code, mode, operands[1],
1027 operands[2])));
1028 }
1029
1030 /* Return TRUE or FALSE depending on whether the binary operator meets the
1031 appropriate constraints. */
1032
1033 bool
1034 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1035 rtx operands[3])
1036 {
1037 rtx dst = operands[0];
1038 rtx src1 = operands[1];
1039 rtx src2 = operands[2];
1040
1041 /* Both source operands cannot be in memory. */
1042 if (MEM_P (src1) && MEM_P (src2))
1043 return false;
1044
1045 /* Canonicalize operand order for commutative operators. */
1046 if (ix86_swap_binary_operands_p (code, mode, operands))
1047 std::swap (src1, src2);
1048
1049 /* If the destination is memory, we must have a matching source operand. */
1050 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1051 return false;
1052
1053 /* Source 1 cannot be a constant. */
1054 if (CONSTANT_P (src1))
1055 return false;
1056
1057 /* Source 1 cannot be a non-matching memory. */
1058 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1059 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1060 return (code == AND
1061 && (mode == HImode
1062 || mode == SImode
1063 || (TARGET_64BIT && mode == DImode))
1064 && satisfies_constraint_L (src2));
1065
1066 return true;
1067 }
1068
1069 /* Attempt to expand a unary operator. Make the expansion closer to the
1070 actual machine than just general_operand, which would allow 2 separate
1071 memory references (one output, one input) in a single insn. */
1072
1073 void
1074 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1075 rtx operands[])
1076 {
1077 bool matching_memory = false;
1078 rtx src, dst, op, clob;
1079
1080 dst = operands[0];
1081 src = operands[1];
1082
1083 /* If the destination is memory, and we do not have matching source
1084 operands, do things in registers. */
1085 if (MEM_P (dst))
1086 {
1087 if (rtx_equal_p (dst, src))
1088 matching_memory = true;
1089 else
1090 dst = gen_reg_rtx (mode);
1091 }
1092
1093 /* When source operand is memory, destination must match. */
1094 if (MEM_P (src) && !matching_memory)
1095 src = force_reg (mode, src);
1096
1097 /* Emit the instruction. */
1098
1099 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1100
1101 if (code == NOT)
1102 emit_insn (op);
1103 else
1104 {
1105 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1106 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1107 }
1108
1109 /* Fix up the destination if needed. */
1110 if (dst != operands[0])
1111 emit_move_insn (operands[0], dst);
1112 }
1113
1114 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1115
1116 static void
1117 predict_jump (int prob)
1118 {
1119 rtx_insn *insn = get_last_insn ();
1120 gcc_assert (JUMP_P (insn));
1121 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1122 }
1123
1124 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1125 divisor are within the range [0-255]. */
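/* As a rough sketch, the emitted sequence for the SImode case is:

     scratch = op2 | op3;
     if ((scratch & ~0xff) == 0)    -- both values fit in 8 bits
       goto qimode_label;
     op0 = op2 / op3;  op1 = op2 % op3;    -- full-width divide
     goto end_label;
   qimode_label:
     8-bit unsigned divide; quotient ends up in AL, remainder in AH;
   end_label:  */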
1126
1127 void
1128 ix86_split_idivmod (machine_mode mode, rtx operands[],
1129 bool unsigned_p)
1130 {
1131 rtx_code_label *end_label, *qimode_label;
1132 rtx div, mod;
1133 rtx_insn *insn;
1134 rtx scratch, tmp0, tmp1, tmp2;
1135 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1136
1137 switch (mode)
1138 {
1139 case E_SImode:
1140 if (GET_MODE (operands[0]) == SImode)
1141 {
1142 if (GET_MODE (operands[1]) == SImode)
1143 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1144 else
1145 gen_divmod4_1
1146 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1147 }
1148 else
1149 gen_divmod4_1
1150 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1151 break;
1152
1153 case E_DImode:
1154 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1155 break;
1156
1157 default:
1158 gcc_unreachable ();
1159 }
1160
1161 end_label = gen_label_rtx ();
1162 qimode_label = gen_label_rtx ();
1163
1164 scratch = gen_reg_rtx (mode);
1165
1166 /* Use 8bit unsigned divmod if dividend and divisor are within
1167 the range [0-255]. */
1168 emit_move_insn (scratch, operands[2]);
1169 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1170 scratch, 1, OPTAB_DIRECT);
1171 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1172 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1173 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1174 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1175 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1176 pc_rtx);
1177 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1178 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1179 JUMP_LABEL (insn) = qimode_label;
1180
1181 /* Generate original signed/unsigned divmod. */
1182 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1183 operands[2], operands[3]));
1184
1185 /* Branch to the end. */
1186 emit_jump_insn (gen_jump (end_label));
1187 emit_barrier ();
1188
1189 /* Generate 8bit unsigned divide. */
1190 emit_label (qimode_label);
1191 /* Don't use operands[0] for result of 8bit divide since not all
1192 registers support QImode ZERO_EXTRACT. */
1193 tmp0 = lowpart_subreg (HImode, scratch, mode);
1194 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1195 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1196 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1197
1198 if (unsigned_p)
1199 {
1200 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1201 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1202 }
1203 else
1204 {
1205 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1206 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1207 }
1208 if (mode == SImode)
1209 {
1210 if (GET_MODE (operands[0]) != SImode)
1211 div = gen_rtx_ZERO_EXTEND (DImode, div);
1212 if (GET_MODE (operands[1]) != SImode)
1213 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1214 }
1215
1216 /* Extract remainder from AH. */
1217 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1218 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1219 GEN_INT (8), GEN_INT (8));
1220 insn = emit_move_insn (operands[1], tmp1);
1221 set_unique_reg_note (insn, REG_EQUAL, mod);
1222
1223 /* Zero extend quotient from AL. */
1224 tmp1 = gen_lowpart (QImode, tmp0);
1225 insn = emit_insn (gen_extend_insn
1226 (operands[0], tmp1,
1227 GET_MODE (operands[0]), QImode, 1));
1228 set_unique_reg_note (insn, REG_EQUAL, div);
1229
1230 emit_label (end_label);
1231 }
1232
1233 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1234 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1235
1236 void
1237 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1238 rtx dst, rtx src)
1239 {
1240 rtx op, clob;
1241
1242 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1243 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1244
1245 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1246 }
1247
1248 /* Return true if regno1 def is nearest to the insn. */
1249
1250 static bool
1251 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1252 {
1253 rtx_insn *prev = insn;
1254 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1255
1256 if (insn == start)
1257 return false;
1258 while (prev && prev != start)
1259 {
1260 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1261 {
1262 prev = PREV_INSN (prev);
1263 continue;
1264 }
1265 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1266 return true;
1267 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1268 return false;
1269 prev = PREV_INSN (prev);
1270 }
1271
1272 /* None of the regs is defined in the bb. */
1273 return false;
1274 }
1275
1276 /* Split lea instructions into a sequence of instructions
1277 which are executed on the ALU to avoid AGU stalls.
1278 It is assumed that the flags register may be clobbered
1279 at the lea position. */
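/* As a rough illustration (register choice is only illustrative), an
   insn such as "lea 0x4(%rbx,%rcx,4), %rax" can be rewritten as

     mov %rcx, %rax
     shl $2, %rax
     add %rbx, %rax
     add $4, %rax

   keeping the work on the ALU.  */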
1280
1281 void
1282 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1283 {
1284 unsigned int regno0, regno1, regno2;
1285 struct ix86_address parts;
1286 rtx target, tmp;
1287 int ok, adds;
1288
1289 ok = ix86_decompose_address (operands[1], &parts);
1290 gcc_assert (ok);
1291
1292 target = gen_lowpart (mode, operands[0]);
1293
1294 regno0 = true_regnum (target);
1295 regno1 = INVALID_REGNUM;
1296 regno2 = INVALID_REGNUM;
1297
1298 if (parts.base)
1299 {
1300 parts.base = gen_lowpart (mode, parts.base);
1301 regno1 = true_regnum (parts.base);
1302 }
1303
1304 if (parts.index)
1305 {
1306 parts.index = gen_lowpart (mode, parts.index);
1307 regno2 = true_regnum (parts.index);
1308 }
1309
1310 if (parts.disp)
1311 parts.disp = gen_lowpart (mode, parts.disp);
1312
1313 if (parts.scale > 1)
1314 {
1315 /* Case r1 = r1 + ... */
1316 if (regno1 == regno0)
1317 {
1318 /* If we have the case r1 = r1 + C * r2 then we
1319 would have to use multiplication, which is very
1320 expensive. Assume the cost model is wrong if we
1321 get such a case here. */
1322 gcc_assert (regno2 != regno0);
1323
1324 for (adds = parts.scale; adds > 0; adds--)
1325 ix86_emit_binop (PLUS, mode, target, parts.index);
1326 }
1327 else
1328 {
1329 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1330 if (regno0 != regno2)
1331 emit_insn (gen_rtx_SET (target, parts.index));
1332
1333 /* Use shift for scaling. */
1334 ix86_emit_binop (ASHIFT, mode, target,
1335 GEN_INT (exact_log2 (parts.scale)));
1336
1337 if (parts.base)
1338 ix86_emit_binop (PLUS, mode, target, parts.base);
1339
1340 if (parts.disp && parts.disp != const0_rtx)
1341 ix86_emit_binop (PLUS, mode, target, parts.disp);
1342 }
1343 }
1344 else if (!parts.base && !parts.index)
1345 {
1346 gcc_assert (parts.disp);
1347 emit_insn (gen_rtx_SET (target, parts.disp));
1348 }
1349 else
1350 {
1351 if (!parts.base)
1352 {
1353 if (regno0 != regno2)
1354 emit_insn (gen_rtx_SET (target, parts.index));
1355 }
1356 else if (!parts.index)
1357 {
1358 if (regno0 != regno1)
1359 emit_insn (gen_rtx_SET (target, parts.base));
1360 }
1361 else
1362 {
1363 if (regno0 == regno1)
1364 tmp = parts.index;
1365 else if (regno0 == regno2)
1366 tmp = parts.base;
1367 else
1368 {
1369 rtx tmp1;
1370
1371 /* Find better operand for SET instruction, depending
1372 on which definition is farther from the insn. */
1373 if (find_nearest_reg_def (insn, regno1, regno2))
1374 tmp = parts.index, tmp1 = parts.base;
1375 else
1376 tmp = parts.base, tmp1 = parts.index;
1377
1378 emit_insn (gen_rtx_SET (target, tmp));
1379
1380 if (parts.disp && parts.disp != const0_rtx)
1381 ix86_emit_binop (PLUS, mode, target, parts.disp);
1382
1383 ix86_emit_binop (PLUS, mode, target, tmp1);
1384 return;
1385 }
1386
1387 ix86_emit_binop (PLUS, mode, target, tmp);
1388 }
1389
1390 if (parts.disp && parts.disp != const0_rtx)
1391 ix86_emit_binop (PLUS, mode, target, parts.disp);
1392 }
1393 }
1394
1395 /* Post-reload splitter for converting an SF or DFmode value in an
1396 SSE register into an unsigned SImode. */
1397
1398 void
1399 ix86_split_convert_uns_si_sse (rtx operands[])
1400 {
1401 machine_mode vecmode;
1402 rtx value, large, zero_or_two31, input, two31, x;
1403
1404 large = operands[1];
1405 zero_or_two31 = operands[2];
1406 input = operands[3];
1407 two31 = operands[4];
1408 vecmode = GET_MODE (large);
1409 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1410
1411 /* Load up the value into the low element. We must ensure that the other
1412 elements are valid floats -- zero is the easiest such value. */
1413 if (MEM_P (input))
1414 {
1415 if (vecmode == V4SFmode)
1416 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1417 else
1418 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1419 }
1420 else
1421 {
1422 input = gen_rtx_REG (vecmode, REGNO (input));
1423 emit_move_insn (value, CONST0_RTX (vecmode));
1424 if (vecmode == V4SFmode)
1425 emit_insn (gen_sse_movss (value, value, input));
1426 else
1427 emit_insn (gen_sse2_movsd (value, value, input));
1428 }
1429
1430 emit_move_insn (large, two31);
1431 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1432
1433 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1434 emit_insn (gen_rtx_SET (large, x));
1435
1436 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1437 emit_insn (gen_rtx_SET (zero_or_two31, x));
1438
1439 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1440 emit_insn (gen_rtx_SET (value, x));
1441
1442 large = gen_rtx_REG (V4SImode, REGNO (large));
1443 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1444
1445 x = gen_rtx_REG (V4SImode, REGNO (value));
1446 if (vecmode == V4SFmode)
1447 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1448 else
1449 emit_insn (gen_sse2_cvttpd2dq (x, value));
1450 value = x;
1451
1452 emit_insn (gen_xorv4si3 (value, value, large));
1453 }
1454
1455 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1456 machine_mode mode, rtx target,
1457 rtx var, int one_var);
1458
1459 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1460 Expects the 64-bit DImode to be supplied in a pair of integral
1461 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1462 -mfpmath=sse, !optimize_size only. */
1463
1464 void
1465 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1466 {
1467 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1468 rtx int_xmm, fp_xmm;
1469 rtx biases, exponents;
1470 rtx x;
1471
1472 int_xmm = gen_reg_rtx (V4SImode);
1473 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1474 emit_insn (gen_movdi_to_sse (int_xmm, input));
1475 else if (TARGET_SSE_SPLIT_REGS)
1476 {
1477 emit_clobber (int_xmm);
1478 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1479 }
1480 else
1481 {
1482 x = gen_reg_rtx (V2DImode);
1483 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1484 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1485 }
1486
1487 x = gen_rtx_CONST_VECTOR (V4SImode,
1488 gen_rtvec (4, GEN_INT (0x43300000UL),
1489 GEN_INT (0x45300000UL),
1490 const0_rtx, const0_rtx));
1491 exponents = validize_mem (force_const_mem (V4SImode, x));
1492
1493 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1494 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1495
1496 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1497 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1498 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1499 (0x1.0p84 + double(fp_value_hi_xmm)).
1500 Note these exponents differ by 32. */
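/* For instance, for the 64-bit input 2**32 + 5 (hi = 1, lo = 5) the two
   packed doubles are 0x1.0p52 + 5 and 0x1.0p84 + 0x1.0p32; after the
   bias subtraction below they become 5.0 and 0x1.0p32, and their sum is
   the desired 4294967301.0.  */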
1501
1502 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1503
1504 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1505 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1506 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1507 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1508 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1509 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1510 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1511 biases = validize_mem (force_const_mem (V2DFmode, biases));
1512 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1513
1514 /* Add the upper and lower DFmode values together. */
1515 if (TARGET_SSE3)
1516 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1517 else
1518 {
1519 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1520 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1521 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1522 }
1523
1524 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1525 }
1526
1527 /* Not used, but eases macroization of patterns. */
1528 void
1529 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1530 {
1531 gcc_unreachable ();
1532 }
1533
1534 /* Convert an unsigned SImode value into a DFmode. Only currently used
1535 for SSE, but applicable anywhere. */
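/* As a rough illustration: the input is biased by -2**31 (wrapping it
   into the signed range), converted with the ordinary signed
   SImode->DFmode conversion, and then 2**31.0 is added back; e.g.
   0xffffffff -> 0x7fffffff -> 2147483647.0 -> 4294967295.0.  */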
1536
1537 void
1538 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1539 {
1540 REAL_VALUE_TYPE TWO31r;
1541 rtx x, fp;
1542
1543 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1544 NULL, 1, OPTAB_DIRECT);
1545
1546 fp = gen_reg_rtx (DFmode);
1547 emit_insn (gen_floatsidf2 (fp, x));
1548
1549 real_ldexp (&TWO31r, &dconst1, 31);
1550 x = const_double_from_real_value (TWO31r, DFmode);
1551
1552 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1553 if (x != target)
1554 emit_move_insn (target, x);
1555 }
1556
1557 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1558 32-bit mode; otherwise we have a direct convert instruction. */
1559
1560 void
1561 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1562 {
1563 REAL_VALUE_TYPE TWO32r;
1564 rtx fp_lo, fp_hi, x;
1565
1566 fp_lo = gen_reg_rtx (DFmode);
1567 fp_hi = gen_reg_rtx (DFmode);
1568
1569 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1570
1571 real_ldexp (&TWO32r, &dconst1, 32);
1572 x = const_double_from_real_value (TWO32r, DFmode);
1573 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1574
1575 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1576
1577 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1578 0, OPTAB_DIRECT);
1579 if (x != target)
1580 emit_move_insn (target, x);
1581 }
1582
1583 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1584 For x86_32, -mfpmath=sse, !optimize_size only. */
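/* As a rough illustration: the input is split as val = hi * 2**16 + lo,
   where each half is exactly representable in SFmode; both halves are
   converted with the signed conversion and recombined as
   fp (hi) * 65536.0 + fp (lo).  */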
1585 void
1586 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1587 {
1588 REAL_VALUE_TYPE ONE16r;
1589 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1590
1591 real_ldexp (&ONE16r, &dconst1, 16);
1592 x = const_double_from_real_value (ONE16r, SFmode);
1593 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1594 NULL, 0, OPTAB_DIRECT);
1595 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1596 NULL, 0, OPTAB_DIRECT);
1597 fp_hi = gen_reg_rtx (SFmode);
1598 fp_lo = gen_reg_rtx (SFmode);
1599 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1600 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1601 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1602 0, OPTAB_DIRECT);
1603 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1604 0, OPTAB_DIRECT);
1605 if (!rtx_equal_p (target, fp_hi))
1606 emit_move_insn (target, fp_hi);
1607 }
1608
1609 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1610 a vector of unsigned ints VAL to vector of floats TARGET. */
1611
1612 void
1613 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1614 {
1615 rtx tmp[8];
1616 REAL_VALUE_TYPE TWO16r;
1617 machine_mode intmode = GET_MODE (val);
1618 machine_mode fltmode = GET_MODE (target);
1619 rtx (*cvt) (rtx, rtx);
1620
1621 if (intmode == V4SImode)
1622 cvt = gen_floatv4siv4sf2;
1623 else
1624 cvt = gen_floatv8siv8sf2;
1625 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1626 tmp[0] = force_reg (intmode, tmp[0]);
1627 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1628 OPTAB_DIRECT);
1629 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1630 NULL_RTX, 1, OPTAB_DIRECT);
1631 tmp[3] = gen_reg_rtx (fltmode);
1632 emit_insn (cvt (tmp[3], tmp[1]));
1633 tmp[4] = gen_reg_rtx (fltmode);
1634 emit_insn (cvt (tmp[4], tmp[2]));
1635 real_ldexp (&TWO16r, &dconst1, 16);
1636 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1637 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1638 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1639 OPTAB_DIRECT);
1640 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1641 OPTAB_DIRECT);
1642 if (tmp[7] != target)
1643 emit_move_insn (target, tmp[7]);
1644 }
1645
1646 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1647 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1648 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1649 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
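/* As a rough illustration: a lane holding 3.5e9 (>= 0x1p31) has 0x1p31
   subtracted before the signed conversion and its lane of *XORP set to
   0x80000000, so the caller's xor restores the unsigned result; a lane
   holding 123.0 is left unchanged and its *XORP lane is zero.  */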
1650
1651 rtx
1652 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1653 {
1654 REAL_VALUE_TYPE TWO31r;
1655 rtx two31r, tmp[4];
1656 machine_mode mode = GET_MODE (val);
1657 machine_mode scalarmode = GET_MODE_INNER (mode);
1658 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1659 rtx (*cmp) (rtx, rtx, rtx, rtx);
1660 int i;
1661
1662 for (i = 0; i < 3; i++)
1663 tmp[i] = gen_reg_rtx (mode);
1664 real_ldexp (&TWO31r, &dconst1, 31);
1665 two31r = const_double_from_real_value (TWO31r, scalarmode);
1666 two31r = ix86_build_const_vector (mode, 1, two31r);
1667 two31r = force_reg (mode, two31r);
1668 switch (mode)
1669 {
1670 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1671 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1672 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1673 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1674 default: gcc_unreachable ();
1675 }
1676 tmp[3] = gen_rtx_LE (mode, two31r, val);
1677 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1678 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1679 0, OPTAB_DIRECT);
1680 if (intmode == V4SImode || TARGET_AVX2)
1681 *xorp = expand_simple_binop (intmode, ASHIFT,
1682 gen_lowpart (intmode, tmp[0]),
1683 GEN_INT (31), NULL_RTX, 0,
1684 OPTAB_DIRECT);
1685 else
1686 {
1687 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1688 two31 = ix86_build_const_vector (intmode, 1, two31);
1689 *xorp = expand_simple_binop (intmode, AND,
1690 gen_lowpart (intmode, tmp[0]),
1691 two31, NULL_RTX, 0,
1692 OPTAB_DIRECT);
1693 }
1694 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1695 0, OPTAB_DIRECT);
1696 }
1697
1698 /* Generate code for floating point ABS or NEG. */
1699
1700 void
1701 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1702 rtx operands[])
1703 {
1704 rtx set, dst, src;
1705 bool use_sse = false;
1706 bool vector_mode = VECTOR_MODE_P (mode);
1707 machine_mode vmode = mode;
1708 rtvec par;
1709
1710 if (vector_mode || mode == TFmode)
1711 use_sse = true;
1712 else if (TARGET_SSE_MATH)
1713 {
1714 use_sse = SSE_FLOAT_MODE_P (mode);
1715 if (mode == SFmode)
1716 vmode = V4SFmode;
1717 else if (mode == DFmode)
1718 vmode = V2DFmode;
1719 }
1720
1721 dst = operands[0];
1722 src = operands[1];
1723
1724 set = gen_rtx_fmt_e (code, mode, src);
1725 set = gen_rtx_SET (dst, set);
1726
1727 if (use_sse)
1728 {
1729 rtx mask, use, clob;
1730
1731 /* NEG and ABS performed with SSE use bitwise mask operations.
1732 Create the appropriate mask now. */
1733 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1734 use = gen_rtx_USE (VOIDmode, mask);
1735 if (vector_mode || mode == TFmode)
1736 par = gen_rtvec (2, set, use);
1737 else
1738 {
1739 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1740 par = gen_rtvec (3, set, use, clob);
1741 }
1742 }
1743 else
1744 {
1745 rtx clob;
1746
1747 /* Changing the sign of FP values is also doable using the integer unit. */
1748 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1749 par = gen_rtvec (2, set, clob);
1750 }
1751
1752 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1753 }
1754
1755 /* Deconstruct a floating point ABS or NEG operation
1756 with integer registers into integer operations. */
1757
1758 void
1759 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1760 rtx operands[])
1761 {
1762 enum rtx_code absneg_op;
1763 rtx dst, set;
1764
1765 gcc_assert (operands_match_p (operands[0], operands[1]));
1766
1767 switch (mode)
1768 {
1769 case E_SFmode:
1770 dst = gen_lowpart (SImode, operands[0]);
1771
1772 if (code == ABS)
1773 {
1774 set = gen_int_mode (0x7fffffff, SImode);
1775 absneg_op = AND;
1776 }
1777 else
1778 {
1779 set = gen_int_mode (0x80000000, SImode);
1780 absneg_op = XOR;
1781 }
1782 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1783 break;
1784
1785 case E_DFmode:
1786 if (TARGET_64BIT)
1787 {
1788 dst = gen_lowpart (DImode, operands[0]);
1789 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1790
1791 if (code == ABS)
1792 set = const0_rtx;
1793 else
1794 set = gen_rtx_NOT (DImode, dst);
1795 }
1796 else
1797 {
1798 dst = gen_highpart (SImode, operands[0]);
1799
1800 if (code == ABS)
1801 {
1802 set = gen_int_mode (0x7fffffff, SImode);
1803 absneg_op = AND;
1804 }
1805 else
1806 {
1807 set = gen_int_mode (0x80000000, SImode);
1808 absneg_op = XOR;
1809 }
1810 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1811 }
1812 break;
1813
1814 case E_XFmode:
1815 dst = gen_rtx_REG (SImode,
1816 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1817 if (code == ABS)
1818 {
1819 set = GEN_INT (0x7fff);
1820 absneg_op = AND;
1821 }
1822 else
1823 {
1824 set = GEN_INT (0x8000);
1825 absneg_op = XOR;
1826 }
1827 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1828 break;
1829
1830 default:
1831 gcc_unreachable ();
1832 }
1833
1834 set = gen_rtx_SET (dst, set);
1835
1836 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1837 rtvec par = gen_rtvec (2, set, clob);
1838
1839 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1840 }
1841
1842 /* Expand a copysign operation. Special case operand 0 being a constant. */
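/* As a rough illustration: with MASK holding only the sign bit, the
   split routines below compute approximately
   DEST = (OP0 & ~MASK) | (OP1 & MASK), i.e. the magnitude of OP0
   combined with the sign of OP1.  */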
1843
1844 void
1845 ix86_expand_copysign (rtx operands[])
1846 {
1847 machine_mode mode, vmode;
1848 rtx dest, op0, op1, mask;
1849
1850 dest = operands[0];
1851 op0 = operands[1];
1852 op1 = operands[2];
1853
1854 mode = GET_MODE (dest);
1855
1856 if (mode == SFmode)
1857 vmode = V4SFmode;
1858 else if (mode == DFmode)
1859 vmode = V2DFmode;
1860 else if (mode == TFmode)
1861 vmode = mode;
1862 else
1863 gcc_unreachable ();
1864
1865 mask = ix86_build_signbit_mask (vmode, 0, 0);
1866
1867 if (CONST_DOUBLE_P (op0))
1868 {
1869 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1870 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1871
1872 if (mode == SFmode || mode == DFmode)
1873 {
1874 if (op0 == CONST0_RTX (mode))
1875 op0 = CONST0_RTX (vmode);
1876 else
1877 {
1878 rtx v = ix86_build_const_vector (vmode, false, op0);
1879
1880 op0 = force_reg (vmode, v);
1881 }
1882 }
1883 else if (op0 != CONST0_RTX (mode))
1884 op0 = force_reg (mode, op0);
1885
1886 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1887 }
1888 else
1889 {
1890 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1891
1892 emit_insn (gen_copysign3_var
1893 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1894 }
1895 }
1896
1897 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1898 be a constant, and so has already been expanded into a vector constant. */
1899
1900 void
1901 ix86_split_copysign_const (rtx operands[])
1902 {
1903 machine_mode mode, vmode;
1904 rtx dest, op0, mask, x;
1905
1906 dest = operands[0];
1907 op0 = operands[1];
1908 mask = operands[3];
1909
1910 mode = GET_MODE (dest);
1911 vmode = GET_MODE (mask);
1912
1913 dest = lowpart_subreg (vmode, dest, mode);
1914 x = gen_rtx_AND (vmode, dest, mask);
1915 emit_insn (gen_rtx_SET (dest, x));
1916
1917 if (op0 != CONST0_RTX (vmode))
1918 {
1919 x = gen_rtx_IOR (vmode, dest, op0);
1920 emit_insn (gen_rtx_SET (dest, x));
1921 }
1922 }
1923
1924 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1925 so we have to do two masks. */
1926
1927 void
1928 ix86_split_copysign_var (rtx operands[])
1929 {
1930 machine_mode mode, vmode;
1931 rtx dest, scratch, op0, op1, mask, nmask, x;
1932
1933 dest = operands[0];
1934 scratch = operands[1];
1935 op0 = operands[2];
1936 op1 = operands[3];
1937 nmask = operands[4];
1938 mask = operands[5];
1939
1940 mode = GET_MODE (dest);
1941 vmode = GET_MODE (mask);
1942
1943 if (rtx_equal_p (op0, op1))
1944 {
1945 /* Shouldn't happen often (it's useless, obviously), but when it does
1946 we'd generate incorrect code if we continue below. */
1947 emit_move_insn (dest, op0);
1948 return;
1949 }
1950
1951 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1952 {
1953 gcc_assert (REGNO (op1) == REGNO (scratch));
1954
1955 x = gen_rtx_AND (vmode, scratch, mask);
1956 emit_insn (gen_rtx_SET (scratch, x));
1957
1958 dest = mask;
1959 op0 = lowpart_subreg (vmode, op0, mode);
1960 x = gen_rtx_NOT (vmode, dest);
1961 x = gen_rtx_AND (vmode, x, op0);
1962 emit_insn (gen_rtx_SET (dest, x));
1963 }
1964 else
1965 {
1966 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1967 {
1968 x = gen_rtx_AND (vmode, scratch, mask);
1969 }
1970 else /* alternative 2,4 */
1971 {
1972 gcc_assert (REGNO (mask) == REGNO (scratch));
1973 op1 = lowpart_subreg (vmode, op1, mode);
1974 x = gen_rtx_AND (vmode, scratch, op1);
1975 }
1976 emit_insn (gen_rtx_SET (scratch, x));
1977
1978 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1979 {
1980 dest = lowpart_subreg (vmode, op0, mode);
1981 x = gen_rtx_AND (vmode, dest, nmask);
1982 }
1983 else /* alternative 3,4 */
1984 {
1985 gcc_assert (REGNO (nmask) == REGNO (dest));
1986 dest = nmask;
1987 op0 = lowpart_subreg (vmode, op0, mode);
1988 x = gen_rtx_AND (vmode, dest, op0);
1989 }
1990 emit_insn (gen_rtx_SET (dest, x));
1991 }
1992
1993 x = gen_rtx_IOR (vmode, dest, scratch);
1994 emit_insn (gen_rtx_SET (dest, x));
1995 }
1996
1997 /* Expand an xorsign operation. */
1998
1999 void
2000 ix86_expand_xorsign (rtx operands[])
2001 {
2002 machine_mode mode, vmode;
2003 rtx dest, op0, op1, mask;
2004
2005 dest = operands[0];
2006 op0 = operands[1];
2007 op1 = operands[2];
2008
2009 mode = GET_MODE (dest);
2010
2011 if (mode == SFmode)
2012 vmode = V4SFmode;
2013 else if (mode == DFmode)
2014 vmode = V2DFmode;
2015 else
2016 gcc_unreachable ();
2017
2018 mask = ix86_build_signbit_mask (vmode, 0, 0);
2019
2020 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2021 }
2022
2023 /* Deconstruct an xorsign operation into bit masks. */
2024
2025 void
2026 ix86_split_xorsign (rtx operands[])
2027 {
2028 machine_mode mode, vmode;
2029 rtx dest, op0, mask, x;
2030
2031 dest = operands[0];
2032 op0 = operands[1];
2033 mask = operands[3];
2034
2035 mode = GET_MODE (dest);
2036 vmode = GET_MODE (mask);
2037
2038 dest = lowpart_subreg (vmode, dest, mode);
2039 x = gen_rtx_AND (vmode, dest, mask);
2040 emit_insn (gen_rtx_SET (dest, x));
2041
2042 op0 = lowpart_subreg (vmode, op0, mode);
2043 x = gen_rtx_XOR (vmode, dest, op0);
2044 emit_insn (gen_rtx_SET (dest, x));
2045 }
2046
2047 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2048
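/* Emit a conditional jump to LABEL, taken when the comparison CODE of
OP0 with OP1 holds. Double-word comparisons (DImode on 32-bit targets,
TImode on 64-bit targets) are decomposed into word-sized
compare-and-branch sequences. */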
2049 void
2050 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2051 {
2052 machine_mode mode = GET_MODE (op0);
2053 rtx tmp;
2054
2055 /* Handle special case - vector comparison with boolean result; transform
2056 it using the ptest instruction. */
2057 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2058 {
2059 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2060 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2061
2062 gcc_assert (code == EQ || code == NE);
2063 /* Generate XOR since we can't check that one operand is zero vector. */
2064 tmp = gen_reg_rtx (mode);
2065 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2066 tmp = gen_lowpart (p_mode, tmp);
2067 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2068 gen_rtx_UNSPEC (CCmode,
2069 gen_rtvec (2, tmp, tmp),
2070 UNSPEC_PTEST)));
2071 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2072 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2073 gen_rtx_LABEL_REF (VOIDmode, label),
2074 pc_rtx);
2075 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2076 return;
2077 }
2078
2079 switch (mode)
2080 {
2081 case E_SFmode:
2082 case E_DFmode:
2083 case E_XFmode:
2084 case E_QImode:
2085 case E_HImode:
2086 case E_SImode:
2087 simple:
2088 tmp = ix86_expand_compare (code, op0, op1);
2089 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2090 gen_rtx_LABEL_REF (VOIDmode, label),
2091 pc_rtx);
2092 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2093 return;
2094
2095 case E_DImode:
2096 if (TARGET_64BIT)
2097 goto simple;
2098 /* For a 32-bit target, DImode comparisons may be performed on
2099 SSE registers. To allow this we should avoid splitting to
2100 SImode, which is achieved by doing the xor in DImode and
2101 then comparing with zero (which is recognized by the STV
2102 pass). We don't compare using xor when optimizing
2103 for size. */
2104 if (!optimize_insn_for_size_p ()
2105 && TARGET_STV
2106 && (code == EQ || code == NE))
2107 {
2108 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2109 op1 = const0_rtx;
2110 }
2111 /* FALLTHRU */
2112 case E_TImode:
2113 /* Expand a double-word branch into multiple compare+branch. */
2114 {
2115 rtx lo[2], hi[2];
2116 rtx_code_label *label2;
2117 enum rtx_code code1, code2, code3;
2118 machine_mode submode;
2119
2120 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2121 {
2122 std::swap (op0, op1);
2123 code = swap_condition (code);
2124 }
2125
2126 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2127 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2128
2129 submode = mode == DImode ? SImode : DImode;
2130
2131 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2132 avoid two branches. This costs one extra insn, so disable when
2133 optimizing for size. */
2134
2135 if ((code == EQ || code == NE)
2136 && (!optimize_insn_for_size_p ()
2137 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2138 {
2139 rtx xor0, xor1;
2140
2141 xor1 = hi[0];
2142 if (hi[1] != const0_rtx)
2143 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2144 NULL_RTX, 0, OPTAB_WIDEN);
2145
2146 xor0 = lo[0];
2147 if (lo[1] != const0_rtx)
2148 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2149 NULL_RTX, 0, OPTAB_WIDEN);
2150
2151 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2152 NULL_RTX, 0, OPTAB_WIDEN);
2153
2154 ix86_expand_branch (code, tmp, const0_rtx, label);
2155 return;
2156 }
2157
2158 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2159 op1 is a constant and the low word is zero, then we can just
2160 examine the high word. Similarly for low word -1 and
2161 less-or-equal-than or greater-than. */
2162
2163 if (CONST_INT_P (hi[1]))
2164 switch (code)
2165 {
2166 case LT: case LTU: case GE: case GEU:
2167 if (lo[1] == const0_rtx)
2168 {
2169 ix86_expand_branch (code, hi[0], hi[1], label);
2170 return;
2171 }
2172 break;
2173 case LE: case LEU: case GT: case GTU:
2174 if (lo[1] == constm1_rtx)
2175 {
2176 ix86_expand_branch (code, hi[0], hi[1], label);
2177 return;
2178 }
2179 break;
2180 default:
2181 break;
2182 }
2183
2184 /* Emulate comparisons that do not depend on Zero flag with
2185 double-word subtraction. Note that only Overflow, Sign
2186 and Carry flags are valid, so swap arguments and condition
2187 of comparisons that would otherwise test Zero flag. */
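/* For example, an unsigned DImode a < b on a 32-bit target becomes:
compare the low halves, subtract-with-borrow the high halves into a
scratch register, then branch on the carry of the combined double-word
subtraction. */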
2188
2189 switch (code)
2190 {
2191 case LE: case LEU: case GT: case GTU:
2192 std::swap (lo[0], lo[1]);
2193 std::swap (hi[0], hi[1]);
2194 code = swap_condition (code);
2195 /* FALLTHRU */
2196
2197 case LT: case LTU: case GE: case GEU:
2198 {
2199 bool uns = (code == LTU || code == GEU);
2200 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2201 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2202
2203 if (!nonimmediate_operand (lo[0], submode))
2204 lo[0] = force_reg (submode, lo[0]);
2205 if (!x86_64_general_operand (lo[1], submode))
2206 lo[1] = force_reg (submode, lo[1]);
2207
2208 if (!register_operand (hi[0], submode))
2209 hi[0] = force_reg (submode, hi[0]);
2210 if ((uns && !nonimmediate_operand (hi[1], submode))
2211 || (!uns && !x86_64_general_operand (hi[1], submode)))
2212 hi[1] = force_reg (submode, hi[1]);
2213
2214 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2215
2216 tmp = gen_rtx_SCRATCH (submode);
2217 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2218
2219 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2220 ix86_expand_branch (code, tmp, const0_rtx, label);
2221 return;
2222 }
2223
2224 default:
2225 break;
2226 }
2227
2228 /* Otherwise, we need two or three jumps. */
2229
2230 label2 = gen_label_rtx ();
2231
2232 code1 = code;
2233 code2 = swap_condition (code);
2234 code3 = unsigned_condition (code);
2235
2236 switch (code)
2237 {
2238 case LT: case GT: case LTU: case GTU:
2239 break;
2240
2241 case LE: code1 = LT; code2 = GT; break;
2242 case GE: code1 = GT; code2 = LT; break;
2243 case LEU: code1 = LTU; code2 = GTU; break;
2244 case GEU: code1 = GTU; code2 = LTU; break;
2245
2246 case EQ: code1 = UNKNOWN; code2 = NE; break;
2247 case NE: code2 = UNKNOWN; break;
2248
2249 default:
2250 gcc_unreachable ();
2251 }
2252
2253 /*
2254 * a < b =>
2255 * if (hi(a) < hi(b)) goto true;
2256 * if (hi(a) > hi(b)) goto false;
2257 * if (lo(a) < lo(b)) goto true;
2258 * false:
2259 */
2260
2261 if (code1 != UNKNOWN)
2262 ix86_expand_branch (code1, hi[0], hi[1], label);
2263 if (code2 != UNKNOWN)
2264 ix86_expand_branch (code2, hi[0], hi[1], label2);
2265
2266 ix86_expand_branch (code3, lo[0], lo[1], label);
2267
2268 if (code2 != UNKNOWN)
2269 emit_label (label2);
2270 return;
2271 }
2272
2273 default:
2274 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2275 goto simple;
2276 }
2277 }
2278
2279 /* Figure out whether to use unordered fp comparisons. */
2280
2281 static bool
2282 ix86_unordered_fp_compare (enum rtx_code code)
2283 {
2284 if (!TARGET_IEEE_FP)
2285 return false;
2286
2287 switch (code)
2288 {
2289 case LT:
2290 case LE:
2291 case GT:
2292 case GE:
2293 case LTGT:
2294 return false;
2295
2296 case EQ:
2297 case NE:
2298
2299 case UNORDERED:
2300 case ORDERED:
2301 case UNLT:
2302 case UNLE:
2303 case UNGT:
2304 case UNGE:
2305 case UNEQ:
2306 return true;
2307
2308 default:
2309 gcc_unreachable ();
2310 }
2311 }
2312
2313 /* Return a comparison we can do that is equivalent to
2314 swap_condition (code), except possibly for orderedness.
2315 Never change orderedness if TARGET_IEEE_FP; return
2316 UNKNOWN in that case if necessary. */
2317
2318 static enum rtx_code
2319 ix86_fp_swap_condition (enum rtx_code code)
2320 {
2321 switch (code)
2322 {
2323 case GT: /* GTU - CF=0 & ZF=0 */
2324 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2325 case GE: /* GEU - CF=0 */
2326 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2327 case UNLT: /* LTU - CF=1 */
2328 return TARGET_IEEE_FP ? UNKNOWN : GT;
2329 case UNLE: /* LEU - CF=1 | ZF=1 */
2330 return TARGET_IEEE_FP ? UNKNOWN : GE;
2331 default:
2332 return swap_condition (code);
2333 }
2334 }
2335
2336 /* Return the cost of comparison CODE using the best strategy for performance.
2337 All following functions use the number of instructions as the cost metric.
2338 In the future this should be tweaked to compute bytes for optimize_size and
2339 to take into account the performance of various instructions on various CPUs. */
2340
2341 static int
2342 ix86_fp_comparison_cost (enum rtx_code code)
2343 {
2344 int arith_cost;
2345
2346 /* The cost of code using bit-twiddling on %ah. */
2347 switch (code)
2348 {
2349 case UNLE:
2350 case UNLT:
2351 case LTGT:
2352 case GT:
2353 case GE:
2354 case UNORDERED:
2355 case ORDERED:
2356 case UNEQ:
2357 arith_cost = 4;
2358 break;
2359 case LT:
2360 case NE:
2361 case EQ:
2362 case UNGE:
2363 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2364 break;
2365 case LE:
2366 case UNGT:
2367 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2368 break;
2369 default:
2370 gcc_unreachable ();
2371 }
2372
2373 switch (ix86_fp_comparison_strategy (code))
2374 {
2375 case IX86_FPCMP_COMI:
2376 return arith_cost > 4 ? 3 : 2;
2377 case IX86_FPCMP_SAHF:
2378 return arith_cost > 4 ? 4 : 3;
2379 default:
2380 return arith_cost;
2381 }
2382 }
2383
2384 /* Swap, force into registers, or otherwise massage the two operands
2385 to a fp comparison. The operands are updated in place; the new
2386 comparison code is returned. */
2387
2388 static enum rtx_code
2389 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2390 {
2391 bool unordered_compare = ix86_unordered_fp_compare (code);
2392 rtx op0 = *pop0, op1 = *pop1;
2393 machine_mode op_mode = GET_MODE (op0);
2394 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2395
2396 /* All of the unordered compare instructions only work on registers.
2397 The same is true of the fcomi compare instructions. The XFmode
2398 compare instructions require registers except when comparing
2399 against zero or when converting operand 1 from fixed point to
2400 floating point. */
2401
2402 if (!is_sse
2403 && (unordered_compare
2404 || (op_mode == XFmode
2405 && ! (standard_80387_constant_p (op0) == 1
2406 || standard_80387_constant_p (op1) == 1)
2407 && GET_CODE (op1) != FLOAT)
2408 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2409 {
2410 op0 = force_reg (op_mode, op0);
2411 op1 = force_reg (op_mode, op1);
2412 }
2413 else
2414 {
2415 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2416 things around if they appear profitable, otherwise force op0
2417 into a register. */
2418
2419 if (standard_80387_constant_p (op0) == 0
2420 || (MEM_P (op0)
2421 && ! (standard_80387_constant_p (op1) == 0
2422 || MEM_P (op1))))
2423 {
2424 enum rtx_code new_code = ix86_fp_swap_condition (code);
2425 if (new_code != UNKNOWN)
2426 {
2427 std::swap (op0, op1);
2428 code = new_code;
2429 }
2430 }
2431
2432 if (!REG_P (op0))
2433 op0 = force_reg (op_mode, op0);
2434
2435 if (CONSTANT_P (op1))
2436 {
2437 int tmp = standard_80387_constant_p (op1);
2438 if (tmp == 0)
2439 op1 = validize_mem (force_const_mem (op_mode, op1));
2440 else if (tmp == 1)
2441 {
2442 if (TARGET_CMOVE)
2443 op1 = force_reg (op_mode, op1);
2444 }
2445 else
2446 op1 = force_reg (op_mode, op1);
2447 }
2448 }
2449
2450 /* Try to rearrange the comparison to make it cheaper. */
2451 if (ix86_fp_comparison_cost (code)
2452 > ix86_fp_comparison_cost (swap_condition (code))
2453 && (REG_P (op1) || can_create_pseudo_p ()))
2454 {
2455 std::swap (op0, op1);
2456 code = swap_condition (code);
2457 if (!REG_P (op0))
2458 op0 = force_reg (op_mode, op0);
2459 }
2460
2461 *pop0 = op0;
2462 *pop1 = op1;
2463 return code;
2464 }
2465
2466 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2467
2468 static rtx
2469 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2470 {
2471 bool unordered_compare = ix86_unordered_fp_compare (code);
2472 machine_mode cmp_mode;
2473 rtx tmp, scratch;
2474
2475 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2476
2477 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2478 if (unordered_compare)
2479 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2480
2481 /* Do fcomi/sahf based test when profitable. */
2482 switch (ix86_fp_comparison_strategy (code))
2483 {
2484 case IX86_FPCMP_COMI:
2485 cmp_mode = CCFPmode;
2486 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2487 break;
2488
2489 case IX86_FPCMP_SAHF:
2490 cmp_mode = CCFPmode;
2491 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2492 scratch = gen_reg_rtx (HImode);
2493 emit_insn (gen_rtx_SET (scratch, tmp));
2494 emit_insn (gen_x86_sahf_1 (scratch));
2495 break;
2496
2497 case IX86_FPCMP_ARITH:
2498 cmp_mode = CCNOmode;
2499 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2500 scratch = gen_reg_rtx (HImode);
2501 emit_insn (gen_rtx_SET (scratch, tmp));
2502
2503 /* In the unordered case, we have to check C2 for NaNs, which
2504 doesn't happen to work out to anything nice combination-wise.
2505 So do some bit twiddling on the value we've got in AH to come
2506 up with an appropriate set of condition codes. */
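/* In the FPU status word as stored by fnstsw, the high byte holds the
condition codes: C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3 in
bit 6 (0x40); 0x45 therefore tests C0|C2|C3. */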
2507
2508 switch (code)
2509 {
2510 case GT:
2511 case UNGT:
2512 if (code == GT || !TARGET_IEEE_FP)
2513 {
2514 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2515 code = EQ;
2516 }
2517 else
2518 {
2519 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2520 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2521 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2522 cmp_mode = CCmode;
2523 code = GEU;
2524 }
2525 break;
2526 case LT:
2527 case UNLT:
2528 if (code == LT && TARGET_IEEE_FP)
2529 {
2530 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2531 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2532 cmp_mode = CCmode;
2533 code = EQ;
2534 }
2535 else
2536 {
2537 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2538 code = NE;
2539 }
2540 break;
2541 case GE:
2542 case UNGE:
2543 if (code == GE || !TARGET_IEEE_FP)
2544 {
2545 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2546 code = EQ;
2547 }
2548 else
2549 {
2550 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2551 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2552 code = NE;
2553 }
2554 break;
2555 case LE:
2556 case UNLE:
2557 if (code == LE && TARGET_IEEE_FP)
2558 {
2559 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2560 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2561 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2562 cmp_mode = CCmode;
2563 code = LTU;
2564 }
2565 else
2566 {
2567 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2568 code = NE;
2569 }
2570 break;
2571 case EQ:
2572 case UNEQ:
2573 if (code == EQ && TARGET_IEEE_FP)
2574 {
2575 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2576 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2577 cmp_mode = CCmode;
2578 code = EQ;
2579 }
2580 else
2581 {
2582 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2583 code = NE;
2584 }
2585 break;
2586 case NE:
2587 case LTGT:
2588 if (code == NE && TARGET_IEEE_FP)
2589 {
2590 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2591 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2592 GEN_INT (0x40)));
2593 code = NE;
2594 }
2595 else
2596 {
2597 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2598 code = EQ;
2599 }
2600 break;
2601
2602 case UNORDERED:
2603 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2604 code = NE;
2605 break;
2606 case ORDERED:
2607 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2608 code = EQ;
2609 break;
2610
2611 default:
2612 gcc_unreachable ();
2613 }
2614 break;
2615
2616 default:
2617 gcc_unreachable ();
2618 }
2619
2620 /* Return the test that should be put into the flags user, i.e.
2621 the bcc, scc, or cmov instruction. */
2622 return gen_rtx_fmt_ee (code, VOIDmode,
2623 gen_rtx_REG (cmp_mode, FLAGS_REG),
2624 const0_rtx);
2625 }
2626
2627 /* Generate insn patterns to do an integer compare of OPERANDS. */
2628
2629 static rtx
2630 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2631 {
2632 machine_mode cmpmode;
2633 rtx tmp, flags;
2634
2635 cmpmode = SELECT_CC_MODE (code, op0, op1);
2636 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2637
2638 /* This is very simple, but making the interface the same as in the
2639 FP case makes the rest of the code easier. */
2640 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2641 emit_insn (gen_rtx_SET (flags, tmp));
2642
2643 /* Return the test that should be put into the flags user, i.e.
2644 the bcc, scc, or cmov instruction. */
2645 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2646 }
2647
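/* Emit the compare insns for a comparison CODE of OP0 with OP1 and
return the rtx that the flags consumer (bcc, scc or cmov) should test.
Dispatches on the mode of OP0 to the CC, floating-point or integer
variant. */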
2648 static rtx
2649 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2650 {
2651 rtx ret;
2652
2653 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2654 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2655
2656 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2657 {
2658 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2659 ret = ix86_expand_fp_compare (code, op0, op1);
2660 }
2661 else
2662 ret = ix86_expand_int_compare (code, op0, op1);
2663
2664 return ret;
2665 }
2666
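/* Store in the QImode register DEST the result of comparing OP0 with
OP1 using comparison CODE. */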
2667 void
2668 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2669 {
2670 rtx ret;
2671
2672 gcc_assert (GET_MODE (dest) == QImode);
2673
2674 ret = ix86_expand_compare (code, op0, op1);
2675 PUT_MODE (ret, QImode);
2676 emit_insn (gen_rtx_SET (dest, ret));
2677 }
2678
2679 /* Expand a comparison setting or clearing the carry flag. Return true when
2680 successful and set *POP to the comparison operation. */
2681 static bool
2682 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2683 {
2684 machine_mode mode
2685 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2686
2687 /* Do not handle double-mode compares that go through the special path. */
2688 if (mode == (TARGET_64BIT ? TImode : DImode))
2689 return false;
2690
2691 if (SCALAR_FLOAT_MODE_P (mode))
2692 {
2693 rtx compare_op;
2694 rtx_insn *compare_seq;
2695
2696 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2697
2698 /* Shortcut: the following common codes never translate
2699 into carry flag compares. */
2700 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2701 || code == ORDERED || code == UNORDERED)
2702 return false;
2703
2704 /* These comparisons require the zero flag; swap operands so they won't. */
2705 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2706 && !TARGET_IEEE_FP)
2707 {
2708 std::swap (op0, op1);
2709 code = swap_condition (code);
2710 }
2711
2712 /* Try to expand the comparison and verify that we end up with
2713 a carry-flag-based comparison. This fails only when we decide
2714 to expand the comparison using arithmetic, which is not a
2715 common scenario. */
2716 start_sequence ();
2717 compare_op = ix86_expand_fp_compare (code, op0, op1);
2718 compare_seq = get_insns ();
2719 end_sequence ();
2720
2721 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2722 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2723 else
2724 code = GET_CODE (compare_op);
2725
2726 if (code != LTU && code != GEU)
2727 return false;
2728
2729 emit_insn (compare_seq);
2730 *pop = compare_op;
2731 return true;
2732 }
2733
2734 if (!INTEGRAL_MODE_P (mode))
2735 return false;
2736
2737 switch (code)
2738 {
2739 case LTU:
2740 case GEU:
2741 break;
2742
2743 /* Convert a==0 into (unsigned)a<1. */
2744 case EQ:
2745 case NE:
2746 if (op1 != const0_rtx)
2747 return false;
2748 op1 = const1_rtx;
2749 code = (code == EQ ? LTU : GEU);
2750 break;
2751
2752 /* Convert a>b into b<a or a>=b+1. */
2753 case GTU:
2754 case LEU:
2755 if (CONST_INT_P (op1))
2756 {
2757 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2758 /* Bail out on overflow. We still can swap operands but that
2759 would force loading of the constant into a register. */
2760 if (op1 == const0_rtx
2761 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2762 return false;
2763 code = (code == GTU ? GEU : LTU);
2764 }
2765 else
2766 {
2767 std::swap (op0, op1);
2768 code = (code == GTU ? LTU : GEU);
2769 }
2770 break;
2771
2772 /* Convert a>=0 into (unsigned)a<0x80000000. */
2773 case LT:
2774 case GE:
2775 if (mode == DImode || op1 != const0_rtx)
2776 return false;
2777 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2778 code = (code == LT ? GEU : LTU);
2779 break;
2780 case LE:
2781 case GT:
2782 if (mode == DImode || op1 != constm1_rtx)
2783 return false;
2784 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2785 code = (code == LE ? GEU : LTU);
2786 break;
2787
2788 default:
2789 return false;
2790 }
2791 /* Swapping operands may cause a constant to appear as the first operand. */
2792 if (!nonimmediate_operand (op0, VOIDmode))
2793 {
2794 if (!can_create_pseudo_p ())
2795 return false;
2796 op0 = force_reg (mode, op0);
2797 }
2798 *pop = ix86_expand_compare (code, op0, op1);
2799 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2800 return true;
2801 }
2802
2803 /* Expand a conditional increment or decrement using adc/sbb instructions.
2804 The default case using setcc followed by a conditional move can be
2805 done by generic code. */
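/* For example, an unsigned "x += (a < b)" becomes a compare of a with b
followed by an adc of 0 into x, so the carry flag supplies the
increment. */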
2806 bool
2807 ix86_expand_int_addcc (rtx operands[])
2808 {
2809 enum rtx_code code = GET_CODE (operands[1]);
2810 rtx flags;
2811 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2812 rtx compare_op;
2813 rtx val = const0_rtx;
2814 bool fpcmp = false;
2815 machine_mode mode;
2816 rtx op0 = XEXP (operands[1], 0);
2817 rtx op1 = XEXP (operands[1], 1);
2818
2819 if (operands[3] != const1_rtx
2820 && operands[3] != constm1_rtx)
2821 return false;
2822 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2823 return false;
2824 code = GET_CODE (compare_op);
2825
2826 flags = XEXP (compare_op, 0);
2827
2828 if (GET_MODE (flags) == CCFPmode)
2829 {
2830 fpcmp = true;
2831 code = ix86_fp_compare_code_to_integer (code);
2832 }
2833
2834 if (code != LTU)
2835 {
2836 val = constm1_rtx;
2837 if (fpcmp)
2838 PUT_CODE (compare_op,
2839 reverse_condition_maybe_unordered
2840 (GET_CODE (compare_op)));
2841 else
2842 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2843 }
2844
2845 mode = GET_MODE (operands[0]);
2846
2847 /* Construct either adc or sbb insn. */
2848 if ((code == LTU) == (operands[3] == constm1_rtx))
2849 insn = gen_sub3_carry;
2850 else
2851 insn = gen_add3_carry;
2852
2853 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2854
2855 return true;
2856 }
2857
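/* Expand an integer conditional move: operands[0] = operands[1]
? operands[2] : operands[3], where operands[1] is the comparison.
Try branch-free sequences based on sbb, setcc and lea before falling
back to cmov. Return true if the expansion succeeded. */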
2858 bool
2859 ix86_expand_int_movcc (rtx operands[])
2860 {
2861 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2862 rtx_insn *compare_seq;
2863 rtx compare_op;
2864 machine_mode mode = GET_MODE (operands[0]);
2865 bool sign_bit_compare_p = false;
2866 rtx op0 = XEXP (operands[1], 0);
2867 rtx op1 = XEXP (operands[1], 1);
2868
2869 if (GET_MODE (op0) == TImode
2870 || (GET_MODE (op0) == DImode
2871 && !TARGET_64BIT))
2872 return false;
2873
2874 start_sequence ();
2875 compare_op = ix86_expand_compare (code, op0, op1);
2876 compare_seq = get_insns ();
2877 end_sequence ();
2878
2879 compare_code = GET_CODE (compare_op);
2880
2881 if ((op1 == const0_rtx && (code == GE || code == LT))
2882 || (op1 == constm1_rtx && (code == GT || code == LE)))
2883 sign_bit_compare_p = true;
2884
2885 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2886 HImode insns, we'd be swallowed in word prefix ops. */
2887
2888 if ((mode != HImode || TARGET_FAST_PREFIX)
2889 && (mode != (TARGET_64BIT ? TImode : DImode))
2890 && CONST_INT_P (operands[2])
2891 && CONST_INT_P (operands[3]))
2892 {
2893 rtx out = operands[0];
2894 HOST_WIDE_INT ct = INTVAL (operands[2]);
2895 HOST_WIDE_INT cf = INTVAL (operands[3]);
2896 HOST_WIDE_INT diff;
2897
2898 diff = ct - cf;
2899 /* Sign bit compares are better done using shifts than by
2900 using sbb. */
2901 if (sign_bit_compare_p
2902 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2903 {
2904 /* Detect overlap between destination and compare sources. */
2905 rtx tmp = out;
2906
2907 if (!sign_bit_compare_p)
2908 {
2909 rtx flags;
2910 bool fpcmp = false;
2911
2912 compare_code = GET_CODE (compare_op);
2913
2914 flags = XEXP (compare_op, 0);
2915
2916 if (GET_MODE (flags) == CCFPmode)
2917 {
2918 fpcmp = true;
2919 compare_code
2920 = ix86_fp_compare_code_to_integer (compare_code);
2921 }
2922
2923 /* To simplify the rest of the code, restrict to the GEU case. */
2924 if (compare_code == LTU)
2925 {
2926 std::swap (ct, cf);
2927 compare_code = reverse_condition (compare_code);
2928 code = reverse_condition (code);
2929 }
2930 else
2931 {
2932 if (fpcmp)
2933 PUT_CODE (compare_op,
2934 reverse_condition_maybe_unordered
2935 (GET_CODE (compare_op)));
2936 else
2937 PUT_CODE (compare_op,
2938 reverse_condition (GET_CODE (compare_op)));
2939 }
2940 diff = ct - cf;
2941
2942 if (reg_overlap_mentioned_p (out, op0)
2943 || reg_overlap_mentioned_p (out, op1))
2944 tmp = gen_reg_rtx (mode);
2945
2946 if (mode == DImode)
2947 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2948 else
2949 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2950 flags, compare_op));
2951 }
2952 else
2953 {
2954 if (code == GT || code == GE)
2955 code = reverse_condition (code);
2956 else
2957 {
2958 std::swap (ct, cf);
2959 diff = ct - cf;
2960 }
2961 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2962 }
2963
2964 if (diff == 1)
2965 {
2966 /*
2967 * cmpl op0,op1
2968 * sbbl dest,dest
2969 * [addl dest, ct]
2970 *
2971 * Size 5 - 8.
2972 */
2973 if (ct)
2974 tmp = expand_simple_binop (mode, PLUS,
2975 tmp, GEN_INT (ct),
2976 copy_rtx (tmp), 1, OPTAB_DIRECT);
2977 }
2978 else if (cf == -1)
2979 {
2980 /*
2981 * cmpl op0,op1
2982 * sbbl dest,dest
2983 * orl $ct, dest
2984 *
2985 * Size 8.
2986 */
2987 tmp = expand_simple_binop (mode, IOR,
2988 tmp, GEN_INT (ct),
2989 copy_rtx (tmp), 1, OPTAB_DIRECT);
2990 }
2991 else if (diff == -1 && ct)
2992 {
2993 /*
2994 * cmpl op0,op1
2995 * sbbl dest,dest
2996 * notl dest
2997 * [addl dest, cf]
2998 *
2999 * Size 8 - 11.
3000 */
3001 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3002 if (cf)
3003 tmp = expand_simple_binop (mode, PLUS,
3004 copy_rtx (tmp), GEN_INT (cf),
3005 copy_rtx (tmp), 1, OPTAB_DIRECT);
3006 }
3007 else
3008 {
3009 /*
3010 * cmpl op0,op1
3011 * sbbl dest,dest
3012 * [notl dest]
3013 * andl cf - ct, dest
3014 * [addl dest, ct]
3015 *
3016 * Size 8 - 11.
3017 */
3018
3019 if (cf == 0)
3020 {
3021 cf = ct;
3022 ct = 0;
3023 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3024 }
3025
3026 tmp = expand_simple_binop (mode, AND,
3027 copy_rtx (tmp),
3028 gen_int_mode (cf - ct, mode),
3029 copy_rtx (tmp), 1, OPTAB_DIRECT);
3030 if (ct)
3031 tmp = expand_simple_binop (mode, PLUS,
3032 copy_rtx (tmp), GEN_INT (ct),
3033 copy_rtx (tmp), 1, OPTAB_DIRECT);
3034 }
3035
3036 if (!rtx_equal_p (tmp, out))
3037 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3038
3039 return true;
3040 }
3041
3042 if (diff < 0)
3043 {
3044 machine_mode cmp_mode = GET_MODE (op0);
3045 enum rtx_code new_code;
3046
3047 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3048 {
3049 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3050
3051 /* We may be reversing a non-trapping
3052 comparison to a trapping comparison. */
3053 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3054 && code != EQ && code != NE
3055 && code != ORDERED && code != UNORDERED)
3056 new_code = UNKNOWN;
3057 else
3058 new_code = reverse_condition_maybe_unordered (code);
3059 }
3060 else
3061 new_code = ix86_reverse_condition (code, cmp_mode);
3062 if (new_code != UNKNOWN)
3063 {
3064 std::swap (ct, cf);
3065 diff = -diff;
3066 code = new_code;
3067 }
3068 }
3069
3070 compare_code = UNKNOWN;
3071 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3072 && CONST_INT_P (op1))
3073 {
3074 if (op1 == const0_rtx
3075 && (code == LT || code == GE))
3076 compare_code = code;
3077 else if (op1 == constm1_rtx)
3078 {
3079 if (code == LE)
3080 compare_code = LT;
3081 else if (code == GT)
3082 compare_code = GE;
3083 }
3084 }
3085
3086 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3087 if (compare_code != UNKNOWN
3088 && GET_MODE (op0) == GET_MODE (out)
3089 && (cf == -1 || ct == -1))
3090 {
3091 /* If the lea code below could be used, only optimize
3092 if it results in a 2-insn sequence. */
3093
3094 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3095 || diff == 3 || diff == 5 || diff == 9)
3096 || (compare_code == LT && ct == -1)
3097 || (compare_code == GE && cf == -1))
3098 {
3099 /*
3100 * notl op1 (if necessary)
3101 * sarl $31, op1
3102 * orl cf, op1
3103 */
3104 if (ct != -1)
3105 {
3106 cf = ct;
3107 ct = -1;
3108 code = reverse_condition (code);
3109 }
3110
3111 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3112
3113 out = expand_simple_binop (mode, IOR,
3114 out, GEN_INT (cf),
3115 out, 1, OPTAB_DIRECT);
3116 if (out != operands[0])
3117 emit_move_insn (operands[0], out);
3118
3119 return true;
3120 }
3121 }
3122
3123
3124 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3125 || diff == 3 || diff == 5 || diff == 9)
3126 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3127 && (mode != DImode
3128 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3129 {
3130 /*
3131 * xorl dest,dest
3132 * cmpl op1,op2
3133 * setcc dest
3134 * lea cf(dest*(ct-cf)),dest
3135 *
3136 * Size 14.
3137 *
3138 * This also catches the degenerate setcc-only case.
3139 */
3140
3141 rtx tmp;
3142 int nops;
3143
3144 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3145
3146 nops = 0;
3147 /* On x86_64 the lea instruction operates on Pmode, so we need
3148 to get the arithmetic done in the proper mode to match. */
3149 if (diff == 1)
3150 tmp = copy_rtx (out);
3151 else
3152 {
3153 rtx out1;
3154 out1 = copy_rtx (out);
3155 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3156 nops++;
3157 if (diff & 1)
3158 {
3159 tmp = gen_rtx_PLUS (mode, tmp, out1);
3160 nops++;
3161 }
3162 }
3163 if (cf != 0)
3164 {
3165 tmp = plus_constant (mode, tmp, cf);
3166 nops++;
3167 }
3168 if (!rtx_equal_p (tmp, out))
3169 {
3170 if (nops == 1)
3171 out = force_operand (tmp, copy_rtx (out));
3172 else
3173 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3174 }
3175 if (!rtx_equal_p (out, operands[0]))
3176 emit_move_insn (operands[0], copy_rtx (out));
3177
3178 return true;
3179 }
3180
3181 /*
3182 * General case: Jumpful:
3183 * xorl dest,dest cmpl op1, op2
3184 * cmpl op1, op2 movl ct, dest
3185 * setcc dest jcc 1f
3186 * decl dest movl cf, dest
3187 * andl (cf-ct),dest 1:
3188 * addl ct,dest
3189 *
3190 * Size 20. Size 14.
3191 *
3192 * This is reasonably steep, but branch mispredict costs are
3193 * high on modern CPUs, so consider failing only if optimizing
3194 * for space.
3195 */
3196
3197 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3198 && BRANCH_COST (optimize_insn_for_speed_p (),
3199 false) >= 2)
3200 {
3201 if (cf == 0)
3202 {
3203 machine_mode cmp_mode = GET_MODE (op0);
3204 enum rtx_code new_code;
3205
3206 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3207 {
3208 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3209
3210 /* We may be reversing a non-trapping
3211 comparison to a trapping comparison. */
3212 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3213 && code != EQ && code != NE
3214 && code != ORDERED && code != UNORDERED)
3215 new_code = UNKNOWN;
3216 else
3217 new_code = reverse_condition_maybe_unordered (code);
3218
3219 }
3220 else
3221 {
3222 new_code = ix86_reverse_condition (code, cmp_mode);
3223 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3224 compare_code = reverse_condition (compare_code);
3225 }
3226
3227 if (new_code != UNKNOWN)
3228 {
3229 cf = ct;
3230 ct = 0;
3231 code = new_code;
3232 }
3233 }
3234
3235 if (compare_code != UNKNOWN)
3236 {
3237 /* notl op1 (if needed)
3238 sarl $31, op1
3239 andl (cf-ct), op1
3240 addl ct, op1
3241
3242 For x < 0 (resp. x <= -1) there will be no notl,
3243 so if possible swap the constants to get rid of the
3244 complement.
3245 True/false will be -1/0 while code below (store flag
3246 followed by decrement) is 0/-1, so the constants need
3247 to be exchanged once more. */
3248
3249 if (compare_code == GE || !cf)
3250 {
3251 code = reverse_condition (code);
3252 compare_code = LT;
3253 }
3254 else
3255 std::swap (ct, cf);
3256
3257 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3258 }
3259 else
3260 {
3261 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3262
3263 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3264 constm1_rtx,
3265 copy_rtx (out), 1, OPTAB_DIRECT);
3266 }
3267
3268 out = expand_simple_binop (mode, AND, copy_rtx (out),
3269 gen_int_mode (cf - ct, mode),
3270 copy_rtx (out), 1, OPTAB_DIRECT);
3271 if (ct)
3272 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3273 copy_rtx (out), 1, OPTAB_DIRECT);
3274 if (!rtx_equal_p (out, operands[0]))
3275 emit_move_insn (operands[0], copy_rtx (out));
3276
3277 return true;
3278 }
3279 }
3280
3281 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3282 {
3283 /* Try a few things more with specific constants and a variable. */
3284
3285 optab op;
3286 rtx var, orig_out, out, tmp;
3287
3288 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3289 return false;
3290
3291 /* If one of the two operands is an interesting constant, load a
3292 constant with the above and mask it in with a logical operation. */
3293
3294 if (CONST_INT_P (operands[2]))
3295 {
3296 var = operands[3];
3297 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3298 operands[3] = constm1_rtx, op = and_optab;
3299 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3300 operands[3] = const0_rtx, op = ior_optab;
3301 else
3302 return false;
3303 }
3304 else if (CONST_INT_P (operands[3]))
3305 {
3306 var = operands[2];
3307 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3308 operands[2] = constm1_rtx, op = and_optab;
3309 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3310 operands[2] = const0_rtx, op = ior_optab;
3311 else
3312 return false;
3313 }
3314 else
3315 return false;
3316
3317 orig_out = operands[0];
3318 tmp = gen_reg_rtx (mode);
3319 operands[0] = tmp;
3320
3321 /* Recurse to get the constant loaded. */
3322 if (!ix86_expand_int_movcc (operands))
3323 return false;
3324
3325 /* Mask in the interesting variable. */
3326 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3327 OPTAB_WIDEN);
3328 if (!rtx_equal_p (out, orig_out))
3329 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3330
3331 return true;
3332 }
3333
3334 /*
3335 * For comparison with above,
3336 *
3337 * movl cf,dest
3338 * movl ct,tmp
3339 * cmpl op1,op2
3340 * cmovcc tmp,dest
3341 *
3342 * Size 15.
3343 */
3344
3345 if (! nonimmediate_operand (operands[2], mode))
3346 operands[2] = force_reg (mode, operands[2]);
3347 if (! nonimmediate_operand (operands[3], mode))
3348 operands[3] = force_reg (mode, operands[3]);
3349
3350 if (! register_operand (operands[2], VOIDmode)
3351 && (mode == QImode
3352 || ! register_operand (operands[3], VOIDmode)))
3353 operands[2] = force_reg (mode, operands[2]);
3354
3355 if (mode == QImode
3356 && ! register_operand (operands[3], VOIDmode))
3357 operands[3] = force_reg (mode, operands[3]);
3358
3359 emit_insn (compare_seq);
3360 emit_insn (gen_rtx_SET (operands[0],
3361 gen_rtx_IF_THEN_ELSE (mode,
3362 compare_op, operands[2],
3363 operands[3])));
3364 return true;
3365 }
3366
3367 /* Detect conditional moves that exactly match min/max operational
3368 semantics. Note that this is IEEE safe, as long as we don't
3369 interchange the operands.
3370
3371 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3372 and TRUE if the operation is successful and instructions are emitted. */
3373
3374 static bool
3375 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3376 rtx cmp_op1, rtx if_true, rtx if_false)
3377 {
3378 machine_mode mode;
3379 bool is_min;
3380 rtx tmp;
3381
3382 if (code == LT)
3383 ;
3384 else if (code == UNGE)
3385 std::swap (if_true, if_false);
3386 else
3387 return false;
3388
3389 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3390 is_min = true;
3391 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3392 is_min = false;
3393 else
3394 return false;
3395
3396 mode = GET_MODE (dest);
3397
3398 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3399 but MODE may be a vector mode and thus not appropriate. */
3400 if (!flag_finite_math_only || flag_signed_zeros)
3401 {
3402 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3403 rtvec v;
3404
3405 if_true = force_reg (mode, if_true);
3406 v = gen_rtvec (2, if_true, if_false);
3407 tmp = gen_rtx_UNSPEC (mode, v, u);
3408 }
3409 else
3410 {
3411 code = is_min ? SMIN : SMAX;
3412 if (MEM_P (if_true) && MEM_P (if_false))
3413 if_true = force_reg (mode, if_true);
3414 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3415 }
3416
3417 emit_insn (gen_rtx_SET (dest, tmp));
3418 return true;
3419 }
3420
3421 /* Return true if MODE is valid for vector compare to mask register;
3422 the same holds for a conditional vector move with mask register. */
3423 static bool
3424 ix86_valid_mask_cmp_mode (machine_mode mode)
3425 {
3426 /* XOP has its own vector conditional movement. */
3427 if (TARGET_XOP && !TARGET_AVX512F)
3428 return false;
3429
3430 /* AVX512F is needed for mask operation. */
3431 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3432 return false;
3433
3434 /* AVX512BW is needed for vector QI/HImode,
3435 AVX512VL is needed for 128/256-bit vectors. */
3436 machine_mode inner_mode = GET_MODE_INNER (mode);
3437 int vector_size = GET_MODE_SIZE (mode);
3438 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3439 return false;
3440
3441 return vector_size == 64 || TARGET_AVX512VL;
3442 }
3443
3444 /* Expand an SSE comparison. Return the register with the result. */
3445
3446 static rtx
3447 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3448 rtx op_true, rtx op_false)
3449 {
3450 machine_mode mode = GET_MODE (dest);
3451 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3452
3453 /* In the general case the result of a comparison can differ from the operands' type. */
3454 machine_mode cmp_mode;
3455
3456 /* In AVX512F the result of comparison is an integer mask. */
3457 bool maskcmp = false;
3458 rtx x;
3459
3460 if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3461 {
3462 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3463 maskcmp = true;
3464 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3465 }
3466 else
3467 cmp_mode = cmp_ops_mode;
3468
3469 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3470
3471 int (*op1_predicate)(rtx, machine_mode)
3472 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3473
3474 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3475 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3476
3477 if (optimize
3478 || (maskcmp && cmp_mode != mode)
3479 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3480 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3481 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3482
3483 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3484
3485 if (cmp_mode != mode && !maskcmp)
3486 {
3487 x = force_reg (cmp_ops_mode, x);
3488 convert_move (dest, x, false);
3489 }
3490 else
3491 emit_insn (gen_rtx_SET (dest, x));
3492
3493 return dest;
3494 }
3495
3496 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3497 operations. This is used for both scalar and vector conditional moves. */
3498
3499 void
3500 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3501 {
3502 machine_mode mode = GET_MODE (dest);
3503 machine_mode cmpmode = GET_MODE (cmp);
3504
3505 /* In AVX512F the result of comparison is an integer mask. */
3506 bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3507
3508 rtx t2, t3, x;
3509
3510 /* If we have an integer mask and an FP value then we need
3511 to cast the mask to FP mode. */
3512 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3513 {
3514 cmp = force_reg (cmpmode, cmp);
3515 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3516 }
3517
3518 if (maskcmp)
3519 {
3520 /* Using vector move with mask register. */
3521 cmp = force_reg (cmpmode, cmp);
3522 /* Optimize for mask zero. */
3523 op_true = (op_true != CONST0_RTX (mode)
3524 ? force_reg (mode, op_true) : op_true);
3525 op_false = (op_false != CONST0_RTX (mode)
3526 ? force_reg (mode, op_false) : op_false);
3527 if (op_true == CONST0_RTX (mode))
3528 {
3529 rtx (*gen_not) (rtx, rtx);
3530 switch (cmpmode)
3531 {
3532 case E_QImode: gen_not = gen_knotqi; break;
3533 case E_HImode: gen_not = gen_knothi; break;
3534 case E_SImode: gen_not = gen_knotsi; break;
3535 case E_DImode: gen_not = gen_knotdi; break;
3536 default: gcc_unreachable ();
3537 }
3538 rtx n = gen_reg_rtx (cmpmode);
3539 emit_insn (gen_not (n, cmp));
3540 cmp = n;
3541 /* Swap op_true and op_false. */
3542 std::swap (op_true, op_false);
3543 }
3544
3545 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3546 emit_insn (gen_rtx_SET (dest, vec_merge));
3547 return;
3548 }
3549 else if (vector_all_ones_operand (op_true, mode)
3550 && op_false == CONST0_RTX (mode))
3551 {
3552 emit_insn (gen_rtx_SET (dest, cmp));
3553 return;
3554 }
3555 else if (op_false == CONST0_RTX (mode))
3556 {
3557 op_true = force_reg (mode, op_true);
3558 x = gen_rtx_AND (mode, cmp, op_true);
3559 emit_insn (gen_rtx_SET (dest, x));
3560 return;
3561 }
3562 else if (op_true == CONST0_RTX (mode))
3563 {
3564 op_false = force_reg (mode, op_false);
3565 x = gen_rtx_NOT (mode, cmp);
3566 x = gen_rtx_AND (mode, x, op_false);
3567 emit_insn (gen_rtx_SET (dest, x));
3568 return;
3569 }
3570 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3571 {
3572 op_false = force_reg (mode, op_false);
3573 x = gen_rtx_IOR (mode, cmp, op_false);
3574 emit_insn (gen_rtx_SET (dest, x));
3575 return;
3576 }
3577 else if (TARGET_XOP)
3578 {
3579 op_true = force_reg (mode, op_true);
3580
3581 if (!nonimmediate_operand (op_false, mode))
3582 op_false = force_reg (mode, op_false);
3583
3584 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3585 op_true,
3586 op_false)));
3587 return;
3588 }
3589
3590 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3591 rtx d = dest;
3592
3593 if (!vector_operand (op_true, mode))
3594 op_true = force_reg (mode, op_true);
3595
3596 op_false = force_reg (mode, op_false);
3597
3598 switch (mode)
3599 {
3600 case E_V4SFmode:
3601 if (TARGET_SSE4_1)
3602 gen = gen_sse4_1_blendvps;
3603 break;
3604 case E_V2DFmode:
3605 if (TARGET_SSE4_1)
3606 gen = gen_sse4_1_blendvpd;
3607 break;
3608 case E_SFmode:
3609 if (TARGET_SSE4_1)
3610 {
3611 gen = gen_sse4_1_blendvss;
3612 op_true = force_reg (mode, op_true);
3613 }
3614 break;
3615 case E_DFmode:
3616 if (TARGET_SSE4_1)
3617 {
3618 gen = gen_sse4_1_blendvsd;
3619 op_true = force_reg (mode, op_true);
3620 }
3621 break;
3622 case E_V16QImode:
3623 case E_V8HImode:
3624 case E_V4SImode:
3625 case E_V2DImode:
3626 if (TARGET_SSE4_1)
3627 {
3628 gen = gen_sse4_1_pblendvb;
3629 if (mode != V16QImode)
3630 d = gen_reg_rtx (V16QImode);
3631 op_false = gen_lowpart (V16QImode, op_false);
3632 op_true = gen_lowpart (V16QImode, op_true);
3633 cmp = gen_lowpart (V16QImode, cmp);
3634 }
3635 break;
3636 case E_V8SFmode:
3637 if (TARGET_AVX)
3638 gen = gen_avx_blendvps256;
3639 break;
3640 case E_V4DFmode:
3641 if (TARGET_AVX)
3642 gen = gen_avx_blendvpd256;
3643 break;
3644 case E_V32QImode:
3645 case E_V16HImode:
3646 case E_V8SImode:
3647 case E_V4DImode:
3648 if (TARGET_AVX2)
3649 {
3650 gen = gen_avx2_pblendvb;
3651 if (mode != V32QImode)
3652 d = gen_reg_rtx (V32QImode);
3653 op_false = gen_lowpart (V32QImode, op_false);
3654 op_true = gen_lowpart (V32QImode, op_true);
3655 cmp = gen_lowpart (V32QImode, cmp);
3656 }
3657 break;
3658
3659 case E_V64QImode:
3660 gen = gen_avx512bw_blendmv64qi;
3661 break;
3662 case E_V32HImode:
3663 gen = gen_avx512bw_blendmv32hi;
3664 break;
3665 case E_V16SImode:
3666 gen = gen_avx512f_blendmv16si;
3667 break;
3668 case E_V8DImode:
3669 gen = gen_avx512f_blendmv8di;
3670 break;
3671 case E_V8DFmode:
3672 gen = gen_avx512f_blendmv8df;
3673 break;
3674 case E_V16SFmode:
3675 gen = gen_avx512f_blendmv16sf;
3676 break;
3677
3678 default:
3679 break;
3680 }
3681
3682 if (gen != NULL)
3683 {
3684 emit_insn (gen (d, op_false, op_true, cmp));
3685 if (d != dest)
3686 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3687 }
3688 else
3689 {
3690 op_true = force_reg (mode, op_true);
3691
3692 t2 = gen_reg_rtx (mode);
3693 if (optimize)
3694 t3 = gen_reg_rtx (mode);
3695 else
3696 t3 = dest;
3697
3698 x = gen_rtx_AND (mode, op_true, cmp);
3699 emit_insn (gen_rtx_SET (t2, x));
3700
3701 x = gen_rtx_NOT (mode, cmp);
3702 x = gen_rtx_AND (mode, x, op_false);
3703 emit_insn (gen_rtx_SET (t3, x));
3704
3705 x = gen_rtx_IOR (mode, t3, t2);
3706 emit_insn (gen_rtx_SET (dest, x));
3707 }
3708 }
3709
3710 /* Swap, force into registers, or otherwise massage the two operands
3711 to an sse comparison with a mask result. Thus we differ a bit from
3712 ix86_prepare_fp_compare_args which expects to produce a flags result.
3713
3714 The DEST operand exists to help determine whether to commute commutative
3715 operators. The POP0/POP1 operands are updated in place. The new
3716 comparison code is returned, or UNKNOWN if not implementable. */
3717
3718 static enum rtx_code
3719 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3720 rtx *pop0, rtx *pop1)
3721 {
3722 switch (code)
3723 {
3724 case LTGT:
3725 case UNEQ:
3726 /* AVX supports all the needed comparisons. */
3727 if (TARGET_AVX)
3728 break;
3729 /* We have no LTGT as an operator. We could implement it with
3730 NE & ORDERED, but this requires an extra temporary. It's
3731 not clear that it's worth it. */
3732 return UNKNOWN;
3733
3734 case LT:
3735 case LE:
3736 case UNGT:
3737 case UNGE:
3738 /* These are supported directly. */
3739 break;
3740
3741 case EQ:
3742 case NE:
3743 case UNORDERED:
3744 case ORDERED:
3745 /* AVX has 3-operand comparisons; no need to swap anything. */
3746 if (TARGET_AVX)
3747 break;
3748 /* For commutative operators, try to canonicalize the destination
3749 operand to be first in the comparison - this helps reload to
3750 avoid extra moves. */
3751 if (!dest || !rtx_equal_p (dest, *pop1))
3752 break;
3753 /* FALLTHRU */
3754
3755 case GE:
3756 case GT:
3757 case UNLE:
3758 case UNLT:
3759 /* These are not supported directly before AVX, and furthermore
3760 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3761 comparison operands to transform into something that is
3762 supported. */
3763 std::swap (*pop0, *pop1);
3764 code = swap_condition (code);
3765 break;
3766
3767 default:
3768 gcc_unreachable ();
3769 }
3770
3771 return code;
3772 }
3773
3774 /* Expand a floating-point conditional move. Return true if successful. */
3775
3776 bool
3777 ix86_expand_fp_movcc (rtx operands[])
3778 {
3779 machine_mode mode = GET_MODE (operands[0]);
3780 enum rtx_code code = GET_CODE (operands[1]);
3781 rtx tmp, compare_op;
3782 rtx op0 = XEXP (operands[1], 0);
3783 rtx op1 = XEXP (operands[1], 1);
3784
3785 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3786 {
3787 machine_mode cmode;
3788
3789 /* Since we have no cmove for SSE registers, don't force bad register
3790 allocation just to gain access to it. Deny movcc when the
3791 comparison mode doesn't match the move mode. */
3792 cmode = GET_MODE (op0);
3793 if (cmode == VOIDmode)
3794 cmode = GET_MODE (op1);
3795 if (cmode != mode)
3796 return false;
3797
3798 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3799 if (code == UNKNOWN)
3800 return false;
3801
3802 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3803 operands[2], operands[3]))
3804 return true;
3805
3806 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3807 operands[2], operands[3]);
3808 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3809 return true;
3810 }
3811
3812 if (GET_MODE (op0) == TImode
3813 || (GET_MODE (op0) == DImode
3814 && !TARGET_64BIT))
3815 return false;
3816
3817 /* The floating point conditional move instructions don't directly
3818 support conditions resulting from a signed integer comparison. */
3819
3820 compare_op = ix86_expand_compare (code, op0, op1);
3821 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3822 {
3823 tmp = gen_reg_rtx (QImode);
3824 ix86_expand_setcc (tmp, code, op0, op1);
3825
3826 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3827 }
3828
3829 emit_insn (gen_rtx_SET (operands[0],
3830 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3831 operands[2], operands[3])));
3832
3833 return true;
3834 }
3835
3836 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3837
3838 static int
3839 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3840 {
3841 switch (code)
3842 {
3843 case EQ:
3844 return 0;
3845 case LT:
3846 case LTU:
3847 return 1;
3848 case LE:
3849 case LEU:
3850 return 2;
3851 case NE:
3852 return 4;
3853 case GE:
3854 case GEU:
3855 return 5;
3856 case GT:
3857 case GTU:
3858 return 6;
3859 default:
3860 gcc_unreachable ();
3861 }
3862 }
3863
3864 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3865
3866 static int
3867 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3868 {
3869 switch (code)
3870 {
3871 case EQ:
3872 return 0x00;
3873 case NE:
3874 return 0x04;
3875 case GT:
3876 return 0x0e;
3877 case LE:
3878 return 0x02;
3879 case GE:
3880 return 0x0d;
3881 case LT:
3882 return 0x01;
3883 case UNLE:
3884 return 0x0a;
3885 case UNLT:
3886 return 0x09;
3887 case UNGE:
3888 return 0x05;
3889 case UNGT:
3890 return 0x06;
3891 case UNEQ:
3892 return 0x18;
3893 case LTGT:
3894 return 0x0c;
3895 case ORDERED:
3896 return 0x07;
3897 case UNORDERED:
3898 return 0x03;
3899 default:
3900 gcc_unreachable ();
3901 }
3902 }
3903
3904 /* Return immediate value to be used in UNSPEC_PCMP
3905 for comparison CODE in MODE. */
3906
3907 static int
3908 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3909 {
3910 if (FLOAT_MODE_P (mode))
3911 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3912 return ix86_int_cmp_code_to_pcmp_immediate (code);
3913 }
3914
3915 /* Expand AVX-512 vector comparison. */
3916
3917 bool
3918 ix86_expand_mask_vec_cmp (rtx operands[])
3919 {
3920 machine_mode mask_mode = GET_MODE (operands[0]);
3921 machine_mode cmp_mode = GET_MODE (operands[2]);
3922 enum rtx_code code = GET_CODE (operands[1]);
3923 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3924 int unspec_code;
3925 rtx unspec;
3926
3927 switch (code)
3928 {
3929 case LEU:
3930 case GTU:
3931 case GEU:
3932 case LTU:
3933 unspec_code = UNSPEC_UNSIGNED_PCMP;
3934 break;
3935
3936 default:
3937 unspec_code = UNSPEC_PCMP;
3938 }
3939
3940 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
3941 operands[3], imm),
3942 unspec_code);
3943 emit_insn (gen_rtx_SET (operands[0], unspec));
3944
3945 return true;
3946 }
3947
3948 /* Expand fp vector comparison. */
3949
3950 bool
3951 ix86_expand_fp_vec_cmp (rtx operands[])
3952 {
3953 enum rtx_code code = GET_CODE (operands[1]);
3954 rtx cmp;
3955
3956 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3957 &operands[2], &operands[3]);
3958 if (code == UNKNOWN)
3959 {
3960 rtx temp;
3961 switch (GET_CODE (operands[1]))
3962 {
3963 case LTGT:
3964 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
3965 operands[3], NULL, NULL);
3966 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
3967 operands[3], NULL, NULL);
3968 code = AND;
3969 break;
3970 case UNEQ:
3971 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
3972 operands[3], NULL, NULL);
3973 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
3974 operands[3], NULL, NULL);
3975 code = IOR;
3976 break;
3977 default:
3978 gcc_unreachable ();
3979 }
3980 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
3981 OPTAB_DIRECT);
3982 }
3983 else
3984 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
3985 operands[1], operands[2]);
3986
3987 if (operands[0] != cmp)
3988 emit_move_insn (operands[0], cmp);
3989
3990 return true;
3991 }
3992
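/* Expand an integer vector comparison CODE of COP0 with COP1 whose result
feeds DEST, canonicalizing to EQ/GT/GTU where the hardware lacks the
requested comparison. *NEGATE is set when the caller must invert the
result; OP_TRUE and OP_FALSE are the values selected by the comparison,
if any. */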
3993 static rtx
3994 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
3995 rtx op_true, rtx op_false, bool *negate)
3996 {
3997 machine_mode data_mode = GET_MODE (dest);
3998 machine_mode mode = GET_MODE (cop0);
3999 rtx x;
4000
4001 *negate = false;
4002
4003 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4004 if (TARGET_XOP
4005 && (mode == V16QImode || mode == V8HImode
4006 || mode == V4SImode || mode == V2DImode))
4007 ;
4008 /* AVX512F supports all of the comparisons
4009 on all 128/256/512-bit vector int types. */
4010 else if (ix86_valid_mask_cmp_mode (mode))
4011 ;
4012 else
4013 {
4014 /* Canonicalize the comparison to EQ, GT, GTU. */
4015 switch (code)
4016 {
4017 case EQ:
4018 case GT:
4019 case GTU:
4020 break;
4021
4022 case NE:
4023 case LE:
4024 case LEU:
4025 code = reverse_condition (code);
4026 *negate = true;
4027 break;
4028
4029 case GE:
4030 case GEU:
4031 code = reverse_condition (code);
4032 *negate = true;
4033 /* FALLTHRU */
4034
4035 case LT:
4036 case LTU:
4037 std::swap (cop0, cop1);
4038 code = swap_condition (code);
4039 break;
4040
4041 default:
4042 gcc_unreachable ();
4043 }
4044
4045 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4046 if (mode == V2DImode)
4047 {
4048 switch (code)
4049 {
4050 case EQ:
4051 /* SSE4.1 supports EQ. */
4052 if (!TARGET_SSE4_1)
4053 return NULL;
4054 break;
4055
4056 case GT:
4057 case GTU:
4058 /* SSE4.2 supports GT/GTU. */
4059 if (!TARGET_SSE4_2)
4060 return NULL;
4061 break;
4062
4063 default:
4064 gcc_unreachable ();
4065 }
4066 }
4067
4068 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4069 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4070 if (*negate)
4071 std::swap (optrue, opfalse);
4072
4073 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4074 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4075 min (x, y) == x). While we add one instruction (the minimum),
4076 we remove the need for two instructions in the negation, as the
4077 result is done this way.
4078 When using masks, do it for SI/DImode element types, as it is shorter
4079 than the two subtractions. */
4080 if ((code != EQ
4081 && GET_MODE_SIZE (mode) != 64
4082 && vector_all_ones_operand (opfalse, data_mode)
4083 && optrue == CONST0_RTX (data_mode))
4084 || (code == GTU
4085 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4086 /* Don't do it if not using integer masks and we'd end up with
4087 the right values in the registers though. */
4088 && (GET_MODE_SIZE (mode) == 64
4089 || !vector_all_ones_operand (optrue, data_mode)
4090 || opfalse != CONST0_RTX (data_mode))))
4091 {
4092 rtx (*gen) (rtx, rtx, rtx) = NULL;
4093
4094 switch (mode)
4095 {
4096 case E_V16SImode:
4097 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4098 break;
4099 case E_V8DImode:
4100 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4101 cop0 = force_reg (mode, cop0);
4102 cop1 = force_reg (mode, cop1);
4103 break;
4104 case E_V32QImode:
4105 if (TARGET_AVX2)
4106 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4107 break;
4108 case E_V16HImode:
4109 if (TARGET_AVX2)
4110 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4111 break;
4112 case E_V8SImode:
4113 if (TARGET_AVX2)
4114 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4115 break;
4116 case E_V4DImode:
4117 if (TARGET_AVX512VL)
4118 {
4119 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4120 cop0 = force_reg (mode, cop0);
4121 cop1 = force_reg (mode, cop1);
4122 }
4123 break;
4124 case E_V16QImode:
4125 if (code == GTU && TARGET_SSE2)
4126 gen = gen_uminv16qi3;
4127 else if (code == GT && TARGET_SSE4_1)
4128 gen = gen_sminv16qi3;
4129 break;
4130 case E_V8HImode:
4131 if (code == GTU && TARGET_SSE4_1)
4132 gen = gen_uminv8hi3;
4133 else if (code == GT && TARGET_SSE2)
4134 gen = gen_sminv8hi3;
4135 break;
4136 case E_V4SImode:
4137 if (TARGET_SSE4_1)
4138 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4139 break;
4140 case E_V2DImode:
4141 if (TARGET_AVX512VL)
4142 {
4143 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4144 cop0 = force_reg (mode, cop0);
4145 cop1 = force_reg (mode, cop1);
4146 }
4147 break;
4148 default:
4149 break;
4150 }
4151
4152 if (gen)
4153 {
4154 rtx tem = gen_reg_rtx (mode);
4155 if (!vector_operand (cop0, mode))
4156 cop0 = force_reg (mode, cop0);
4157 if (!vector_operand (cop1, mode))
4158 cop1 = force_reg (mode, cop1);
4159 *negate = !*negate;
4160 emit_insn (gen (tem, cop0, cop1));
4161 cop1 = tem;
4162 code = EQ;
4163 }
4164 }
4165
4166 /* Unsigned parallel compare is not supported by the hardware.
4167 Play some tricks to turn this into a signed comparison
4168 against 0. */
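/* Sketch of the two tricks used below: for SI/DI element modes,
   x GTU y is computed as (x - 0x80...0) GT (y - 0x80...0), a signed
   compare after biasing both operands by the sign bit; for QI/HI
   element modes it is computed as US_MINUS (x, y) != 0, since the
   unsigned saturating difference is nonzero exactly when x > y.  */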
4169 if (code == GTU)
4170 {
4171 cop0 = force_reg (mode, cop0);
4172
4173 switch (mode)
4174 {
4175 case E_V16SImode:
4176 case E_V8DImode:
4177 case E_V8SImode:
4178 case E_V4DImode:
4179 case E_V4SImode:
4180 case E_V2DImode:
4181 {
4182 rtx t1, t2, mask;
4183
4184 /* Subtract (-(INT MAX) - 1) from both operands to make
4185 them signed. */
4186 mask = ix86_build_signbit_mask (mode, true, false);
4187 t1 = gen_reg_rtx (mode);
4188 emit_insn (gen_sub3_insn (t1, cop0, mask));
4189
4190 t2 = gen_reg_rtx (mode);
4191 emit_insn (gen_sub3_insn (t2, cop1, mask));
4192
4193 cop0 = t1;
4194 cop1 = t2;
4195 code = GT;
4196 }
4197 break;
4198
4199 case E_V64QImode:
4200 case E_V32HImode:
4201 case E_V32QImode:
4202 case E_V16HImode:
4203 case E_V16QImode:
4204 case E_V8HImode:
4205 /* Perform a parallel unsigned saturating subtraction. */
4206 x = gen_reg_rtx (mode);
4207 emit_insn (gen_rtx_SET
4208 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4209 cop0 = x;
4210 cop1 = CONST0_RTX (mode);
4211 code = EQ;
4212 *negate = !*negate;
4213 break;
4214
4215 default:
4216 gcc_unreachable ();
4217 }
4218 }
4219 }
4220
4221 if (*negate)
4222 std::swap (op_true, op_false);
4223
4224 /* Allow the comparison to be done in one mode, but the movcc to
4225 happen in another mode. */
4226 if (data_mode == mode)
4227 {
4228 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4229 op_true, op_false);
4230 }
4231 else
4232 {
4233 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4234 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4235 op_true, op_false);
4236 if (GET_MODE (x) == mode)
4237 x = gen_lowpart (data_mode, x);
4238 }
4239
4240 return x;
4241 }
4242
4243 /* Expand integer vector comparison. */
4244
4245 bool
4246 ix86_expand_int_vec_cmp (rtx operands[])
4247 {
4248 rtx_code code = GET_CODE (operands[1]);
4249 bool negate = false;
4250 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4251 operands[3], NULL, NULL, &negate);
4252
4253 if (!cmp)
4254 return false;
4255
4256 if (negate)
4257 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4258 CONST0_RTX (GET_MODE (cmp)),
4259 NULL, NULL, &negate);
4260
4261 gcc_assert (!negate);
4262
4263 if (operands[0] != cmp)
4264 emit_move_insn (operands[0], cmp);
4265
4266 return true;
4267 }
4268
4269 /* Expand a floating-point vector conditional move; a vcond operation
4270 rather than a movcc operation. */
4271
4272 bool
4273 ix86_expand_fp_vcond (rtx operands[])
4274 {
4275 enum rtx_code code = GET_CODE (operands[3]);
4276 rtx cmp;
4277
4278 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4279 &operands[4], &operands[5]);
4280 if (code == UNKNOWN)
4281 {
4282 rtx temp;
4283 switch (GET_CODE (operands[3]))
4284 {
4285 case LTGT:
4286 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4287 operands[5], operands[0], operands[0]);
4288 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4289 operands[5], operands[1], operands[2]);
4290 code = AND;
4291 break;
4292 case UNEQ:
4293 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4294 operands[5], operands[0], operands[0]);
4295 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4296 operands[5], operands[1], operands[2]);
4297 code = IOR;
4298 break;
4299 default:
4300 gcc_unreachable ();
4301 }
4302 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4303 OPTAB_DIRECT);
4304 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4305 return true;
4306 }
4307
4308 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4309 operands[5], operands[1], operands[2]))
4310 return true;
4311
4312 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4313 operands[1], operands[2]);
4314 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4315 return true;
4316 }
4317
4318 /* Expand a signed/unsigned integral vector conditional move. */
4319
4320 bool
4321 ix86_expand_int_vcond (rtx operands[])
4322 {
4323 machine_mode data_mode = GET_MODE (operands[0]);
4324 machine_mode mode = GET_MODE (operands[4]);
4325 enum rtx_code code = GET_CODE (operands[3]);
4326 bool negate = false;
4327 rtx x, cop0, cop1;
4328
4329 cop0 = operands[4];
4330 cop1 = operands[5];
4331
4332 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4333 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4334 if ((code == LT || code == GE)
4335 && data_mode == mode
4336 && cop1 == CONST0_RTX (mode)
4337 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4338 && GET_MODE_UNIT_SIZE (data_mode) > 1
4339 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4340 && (GET_MODE_SIZE (data_mode) == 16
4341 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4342 {
4343 rtx negop = operands[2 - (code == LT)];
4344 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4345 if (negop == CONST1_RTX (data_mode))
4346 {
4347 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4348 operands[0], 1, OPTAB_DIRECT);
4349 if (res != operands[0])
4350 emit_move_insn (operands[0], res);
4351 return true;
4352 }
4353 else if (GET_MODE_INNER (data_mode) != DImode
4354 && vector_all_ones_operand (negop, data_mode))
4355 {
4356 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4357 operands[0], 0, OPTAB_DIRECT);
4358 if (res != operands[0])
4359 emit_move_insn (operands[0], res);
4360 return true;
4361 }
4362 }
4363
4364 if (!nonimmediate_operand (cop1, mode))
4365 cop1 = force_reg (mode, cop1);
4366 if (!general_operand (operands[1], data_mode))
4367 operands[1] = force_reg (data_mode, operands[1]);
4368 if (!general_operand (operands[2], data_mode))
4369 operands[2] = force_reg (data_mode, operands[2]);
4370
4371 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4372 operands[1], operands[2], &negate);
4373
4374 if (!x)
4375 return false;
4376
4377 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4378 operands[2-negate]);
4379 return true;
4380 }
4381
4382 static bool
4383 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4384 struct expand_vec_perm_d *d)
4385 {
4386 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4387 expander, so args are either in d, or in op0, op1 etc. */
4388 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4389 machine_mode maskmode = mode;
4390 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4391
4392 switch (mode)
4393 {
4394 case E_V8HImode:
4395 if (TARGET_AVX512VL && TARGET_AVX512BW)
4396 gen = gen_avx512vl_vpermt2varv8hi3;
4397 break;
4398 case E_V16HImode:
4399 if (TARGET_AVX512VL && TARGET_AVX512BW)
4400 gen = gen_avx512vl_vpermt2varv16hi3;
4401 break;
4402 case E_V64QImode:
4403 if (TARGET_AVX512VBMI)
4404 gen = gen_avx512bw_vpermt2varv64qi3;
4405 break;
4406 case E_V32HImode:
4407 if (TARGET_AVX512BW)
4408 gen = gen_avx512bw_vpermt2varv32hi3;
4409 break;
4410 case E_V4SImode:
4411 if (TARGET_AVX512VL)
4412 gen = gen_avx512vl_vpermt2varv4si3;
4413 break;
4414 case E_V8SImode:
4415 if (TARGET_AVX512VL)
4416 gen = gen_avx512vl_vpermt2varv8si3;
4417 break;
4418 case E_V16SImode:
4419 if (TARGET_AVX512F)
4420 gen = gen_avx512f_vpermt2varv16si3;
4421 break;
4422 case E_V4SFmode:
4423 if (TARGET_AVX512VL)
4424 {
4425 gen = gen_avx512vl_vpermt2varv4sf3;
4426 maskmode = V4SImode;
4427 }
4428 break;
4429 case E_V8SFmode:
4430 if (TARGET_AVX512VL)
4431 {
4432 gen = gen_avx512vl_vpermt2varv8sf3;
4433 maskmode = V8SImode;
4434 }
4435 break;
4436 case E_V16SFmode:
4437 if (TARGET_AVX512F)
4438 {
4439 gen = gen_avx512f_vpermt2varv16sf3;
4440 maskmode = V16SImode;
4441 }
4442 break;
4443 case E_V2DImode:
4444 if (TARGET_AVX512VL)
4445 gen = gen_avx512vl_vpermt2varv2di3;
4446 break;
4447 case E_V4DImode:
4448 if (TARGET_AVX512VL)
4449 gen = gen_avx512vl_vpermt2varv4di3;
4450 break;
4451 case E_V8DImode:
4452 if (TARGET_AVX512F)
4453 gen = gen_avx512f_vpermt2varv8di3;
4454 break;
4455 case E_V2DFmode:
4456 if (TARGET_AVX512VL)
4457 {
4458 gen = gen_avx512vl_vpermt2varv2df3;
4459 maskmode = V2DImode;
4460 }
4461 break;
4462 case E_V4DFmode:
4463 if (TARGET_AVX512VL)
4464 {
4465 gen = gen_avx512vl_vpermt2varv4df3;
4466 maskmode = V4DImode;
4467 }
4468 break;
4469 case E_V8DFmode:
4470 if (TARGET_AVX512F)
4471 {
4472 gen = gen_avx512f_vpermt2varv8df3;
4473 maskmode = V8DImode;
4474 }
4475 break;
4476 default:
4477 break;
4478 }
4479
4480 if (gen == NULL)
4481 return false;
4482
4483 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4484 expander, so args are either in d, or in op0, op1 etc. */
4485 if (d)
4486 {
4487 rtx vec[64];
4488 target = d->target;
4489 op0 = d->op0;
4490 op1 = d->op1;
4491 for (int i = 0; i < d->nelt; ++i)
4492 vec[i] = GEN_INT (d->perm[i]);
4493 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4494 }
4495
4496 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4497 return true;
4498 }
4499
4500 /* Expand a variable vector permutation. */
4501
4502 void
4503 ix86_expand_vec_perm (rtx operands[])
4504 {
4505 rtx target = operands[0];
4506 rtx op0 = operands[1];
4507 rtx op1 = operands[2];
4508 rtx mask = operands[3];
4509 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4510 machine_mode mode = GET_MODE (op0);
4511 machine_mode maskmode = GET_MODE (mask);
4512 int w, e, i;
4513 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4514
4515 /* Number of elements in the vector. */
4516 w = GET_MODE_NUNITS (mode);
4517 e = GET_MODE_UNIT_SIZE (mode);
4518 gcc_assert (w <= 64);
4519
4520 if (TARGET_AVX512F && one_operand_shuffle)
4521 {
4522 rtx (*gen) (rtx, rtx, rtx) = NULL;
4523 switch (mode)
4524 {
4525 case E_V16SImode:
4526 gen = gen_avx512f_permvarv16si;
4527 break;
4528 case E_V16SFmode:
4529 gen = gen_avx512f_permvarv16sf;
4530 break;
4531 case E_V8DImode:
4532 gen = gen_avx512f_permvarv8di;
4533 break;
4534 case E_V8DFmode:
4535 gen = gen_avx512f_permvarv8df;
4536 break;
4537 default:
4538 break;
4539 }
4540 if (gen != NULL)
4541 {
4542 emit_insn (gen (target, op0, mask));
4543 return;
4544 }
4545 }
4546
4547 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4548 return;
4549
4550 if (TARGET_AVX2)
4551 {
4552 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4553 {
4554 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4555 a constant shuffle operand. With a tiny bit of effort we can
4556 use VPERMD instead. A re-interpretation stall for V4DFmode is
4557 unfortunate but there's no avoiding it.
4558 Similarly for V16HImode we don't have instructions for variable
4559 shuffling, while for V32QImode we can use vpshufb; vpshufb;
4560 vpermq; vpor after preparing suitable masks. */
4561
4562 if (mode == V16HImode)
4563 {
4564 maskmode = mode = V32QImode;
4565 w = 32;
4566 e = 1;
4567 }
4568 else
4569 {
4570 maskmode = mode = V8SImode;
4571 w = 8;
4572 e = 4;
4573 }
4574 t1 = gen_reg_rtx (maskmode);
4575
4576 /* Replicate the low bits of the V4DImode mask into V8SImode:
4577 mask = { A B C D }
4578 t1 = { A A B B C C D D }. */
4579 for (i = 0; i < w / 2; ++i)
4580 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4581 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4582 vt = force_reg (maskmode, vt);
4583 mask = gen_lowpart (maskmode, mask);
4584 if (maskmode == V8SImode)
4585 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4586 else
4587 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4588
4589 /* Multiply the shuffle indices by two. */
4590 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4591 OPTAB_DIRECT);
4592
4593 /* Add one to the odd shuffle indices:
4594 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4595 for (i = 0; i < w / 2; ++i)
4596 {
4597 vec[i * 2] = const0_rtx;
4598 vec[i * 2 + 1] = const1_rtx;
4599 }
4600 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4601 vt = validize_mem (force_const_mem (maskmode, vt));
4602 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4603 OPTAB_DIRECT);
4604
4605 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4606 operands[3] = mask = t1;
4607 target = gen_reg_rtx (mode);
4608 op0 = gen_lowpart (mode, op0);
4609 op1 = gen_lowpart (mode, op1);
4610 }
4611
4612 switch (mode)
4613 {
4614 case E_V8SImode:
4615 /* The VPERMD and VPERMPS instructions already properly ignore
4616 the high bits of the shuffle elements. No need for us to
4617 perform an AND ourselves. */
4618 if (one_operand_shuffle)
4619 {
4620 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4621 if (target != operands[0])
4622 emit_move_insn (operands[0],
4623 gen_lowpart (GET_MODE (operands[0]), target));
4624 }
4625 else
4626 {
4627 t1 = gen_reg_rtx (V8SImode);
4628 t2 = gen_reg_rtx (V8SImode);
4629 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4630 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4631 goto merge_two;
4632 }
4633 return;
4634
4635 case E_V8SFmode:
4636 mask = gen_lowpart (V8SImode, mask);
4637 if (one_operand_shuffle)
4638 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4639 else
4640 {
4641 t1 = gen_reg_rtx (V8SFmode);
4642 t2 = gen_reg_rtx (V8SFmode);
4643 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4644 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4645 goto merge_two;
4646 }
4647 return;
4648
4649 case E_V4SImode:
4650 /* By combining the two 128-bit input vectors into one 256-bit
4651 input vector, we can use VPERMD and VPERMPS for the full
4652 two-operand shuffle. */
4653 t1 = gen_reg_rtx (V8SImode);
4654 t2 = gen_reg_rtx (V8SImode);
4655 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4656 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4657 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4658 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4659 return;
4660
4661 case E_V4SFmode:
4662 t1 = gen_reg_rtx (V8SFmode);
4663 t2 = gen_reg_rtx (V8SImode);
4664 mask = gen_lowpart (V4SImode, mask);
4665 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4666 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4667 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4668 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4669 return;
4670
4671 case E_V32QImode:
4672 t1 = gen_reg_rtx (V32QImode);
4673 t2 = gen_reg_rtx (V32QImode);
4674 t3 = gen_reg_rtx (V32QImode);
4675 vt2 = GEN_INT (-128);
4676 vt = gen_const_vec_duplicate (V32QImode, vt2);
4677 vt = force_reg (V32QImode, vt);
4678 for (i = 0; i < 32; i++)
4679 vec[i] = i < 16 ? vt2 : const0_rtx;
4680 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4681 vt2 = force_reg (V32QImode, vt2);
4682 /* From mask create two adjusted masks, which contain the same
4683 bits as mask in the low 7 bits of each vector element.
4684 The first mask will have the most significant bit clear
4685 if it requests an element from the same 128-bit lane
4686 and the MSB set if it requests an element from the other 128-bit lane.
4687 The second mask will have the opposite values of the MSB,
4688 and additionally will have its 128-bit lanes swapped.
4689 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4690 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4691 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4692 stands for the other 12 bytes. */
4693 /* The bit that says whether an element is from the same lane or the
4694 other lane is bit 4, so shift it up by 3 to the MSB position. */
4695 t5 = gen_reg_rtx (V4DImode);
4696 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4697 GEN_INT (3)));
4698 /* Clear MSB bits from the mask just in case it had them set. */
4699 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4700 /* After this t1 will have MSB set for elements from other lane. */
4701 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4702 /* Clear bits other than MSB. */
4703 emit_insn (gen_andv32qi3 (t1, t1, vt));
4704 /* Or in the lower bits from mask into t3. */
4705 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4706 /* And invert MSB bits in t1, so MSB is set for elements from the same
4707 lane. */
4708 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4709 /* Swap 128-bit lanes in t3. */
4710 t6 = gen_reg_rtx (V4DImode);
4711 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4712 const2_rtx, GEN_INT (3),
4713 const0_rtx, const1_rtx));
4714 /* And or in the lower bits from mask into t1. */
4715 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4716 if (one_operand_shuffle)
4717 {
4718 /* Each of these shuffles will put 0s in places where
4719 element from the other 128-bit lane is needed, otherwise
4720 will shuffle in the requested value. */
4721 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4722 gen_lowpart (V32QImode, t6)));
4723 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4724 /* For t3 the 128-bit lanes are swapped again. */
4725 t7 = gen_reg_rtx (V4DImode);
4726 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4727 const2_rtx, GEN_INT (3),
4728 const0_rtx, const1_rtx));
4729 /* And oring both together leads to the result. */
4730 emit_insn (gen_iorv32qi3 (target, t1,
4731 gen_lowpart (V32QImode, t7)));
4732 if (target != operands[0])
4733 emit_move_insn (operands[0],
4734 gen_lowpart (GET_MODE (operands[0]), target));
4735 return;
4736 }
4737
4738 t4 = gen_reg_rtx (V32QImode);
4739 /* Similarly to the above one_operand_shuffle code,
4740 just repeated twice for each operand. The merge_two:
4741 code below will merge the two results together. */
4742 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4743 gen_lowpart (V32QImode, t6)));
4744 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4745 gen_lowpart (V32QImode, t6)));
4746 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4747 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4748 t7 = gen_reg_rtx (V4DImode);
4749 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4750 const2_rtx, GEN_INT (3),
4751 const0_rtx, const1_rtx));
4752 t8 = gen_reg_rtx (V4DImode);
4753 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4754 const2_rtx, GEN_INT (3),
4755 const0_rtx, const1_rtx));
4756 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4757 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4758 t1 = t4;
4759 t2 = t3;
4760 goto merge_two;
4761
4762 default:
4763 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4764 break;
4765 }
4766 }
4767
4768 if (TARGET_XOP)
4769 {
4770 /* The XOP VPPERM insn supports three inputs. By ignoring the
4771 one_operand_shuffle special case, we avoid creating another
4772 set of constant vectors in memory. */
4773 one_operand_shuffle = false;
4774
4775 /* mask = mask & {2*w-1, ...} */
4776 vt = GEN_INT (2*w - 1);
4777 }
4778 else
4779 {
4780 /* mask = mask & {w-1, ...} */
4781 vt = GEN_INT (w - 1);
4782 }
4783
4784 vt = gen_const_vec_duplicate (maskmode, vt);
4785 mask = expand_simple_binop (maskmode, AND, mask, vt,
4786 NULL_RTX, 0, OPTAB_DIRECT);
4787
4788 /* For non-QImode operations, convert the word permutation control
4789 into a byte permutation control. */
4790 if (mode != V16QImode)
4791 {
4792 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4793 GEN_INT (exact_log2 (e)),
4794 NULL_RTX, 0, OPTAB_DIRECT);
4795
4796 /* Convert mask to vector of chars. */
4797 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4798
4799 /* Replicate each of the input bytes into byte positions:
4800 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4801 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4802 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4803 for (i = 0; i < 16; ++i)
4804 vec[i] = GEN_INT (i/e * e);
4805 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4806 vt = validize_mem (force_const_mem (V16QImode, vt));
4807 if (TARGET_XOP)
4808 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4809 else
4810 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4811
4812 /* Convert it into the byte positions by doing
4813 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4814 for (i = 0; i < 16; ++i)
4815 vec[i] = GEN_INT (i % e);
4816 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4817 vt = validize_mem (force_const_mem (V16QImode, vt));
4818 emit_insn (gen_addv16qi3 (mask, mask, vt));
4819 }
4820
4821 /* The actual shuffle operations all operate on V16QImode. */
4822 op0 = gen_lowpart (V16QImode, op0);
4823 op1 = gen_lowpart (V16QImode, op1);
4824
4825 if (TARGET_XOP)
4826 {
4827 if (GET_MODE (target) != V16QImode)
4828 target = gen_reg_rtx (V16QImode);
4829 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4830 if (target != operands[0])
4831 emit_move_insn (operands[0],
4832 gen_lowpart (GET_MODE (operands[0]), target));
4833 }
4834 else if (one_operand_shuffle)
4835 {
4836 if (GET_MODE (target) != V16QImode)
4837 target = gen_reg_rtx (V16QImode);
4838 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4839 if (target != operands[0])
4840 emit_move_insn (operands[0],
4841 gen_lowpart (GET_MODE (operands[0]), target));
4842 }
4843 else
4844 {
4845 rtx xops[6];
4846 bool ok;
4847
4848 /* Shuffle the two input vectors independently. */
4849 t1 = gen_reg_rtx (V16QImode);
4850 t2 = gen_reg_rtx (V16QImode);
4851 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4852 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4853
4854 merge_two:
4855 /* Then merge them together. The key is whether any given control
4856 element contained a bit set that indicates the second word. */
4857 mask = operands[3];
4858 vt = GEN_INT (w);
4859 if (maskmode == V2DImode && !TARGET_SSE4_1)
4860 {
4861 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4862 more shuffle to convert the V2DI input mask into a V4SI
4863 input mask, at which point the masking that expand_int_vcond
4864 performs will work as desired. */
4865 rtx t3 = gen_reg_rtx (V4SImode);
4866 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4867 const0_rtx, const0_rtx,
4868 const2_rtx, const2_rtx));
4869 mask = t3;
4870 maskmode = V4SImode;
4871 e = w = 4;
4872 }
4873
4874 vt = gen_const_vec_duplicate (maskmode, vt);
4875 vt = force_reg (maskmode, vt);
4876 mask = expand_simple_binop (maskmode, AND, mask, vt,
4877 NULL_RTX, 0, OPTAB_DIRECT);
4878
4879 if (GET_MODE (target) != mode)
4880 target = gen_reg_rtx (mode);
4881 xops[0] = target;
4882 xops[1] = gen_lowpart (mode, t2);
4883 xops[2] = gen_lowpart (mode, t1);
4884 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4885 xops[4] = mask;
4886 xops[5] = vt;
4887 ok = ix86_expand_int_vcond (xops);
4888 gcc_assert (ok);
4889 if (target != operands[0])
4890 emit_move_insn (operands[0],
4891 gen_lowpart (GET_MODE (operands[0]), target));
4892 }
4893 }
4894
4895 /* Unpack SRC into DEST, widening to the next wider integer vector type.
4896 UNSIGNED_P is true if we should do zero extension, else sign extension.
4897 HIGH_P is true if we want the N/2 high elements, else the low elements. */
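/* For instance, with SRC in V8HImode, UNSIGNED_P set and HIGH_P clear,
   the low four HImode elements are zero-extended into the four SImode
   elements of DEST; on SSE4.1 this is a single PMOVZXWD, otherwise it
   is done by interleaving SRC with a zero vector (PUNPCKLWD).  */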
4898
4899 void
4900 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4901 {
4902 machine_mode imode = GET_MODE (src);
4903 rtx tmp;
4904
4905 if (TARGET_SSE4_1)
4906 {
4907 rtx (*unpack)(rtx, rtx);
4908 rtx (*extract)(rtx, rtx) = NULL;
4909 machine_mode halfmode = BLKmode;
4910
4911 switch (imode)
4912 {
4913 case E_V64QImode:
4914 if (unsigned_p)
4915 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4916 else
4917 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4918 halfmode = V32QImode;
4919 extract
4920 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4921 break;
4922 case E_V32QImode:
4923 if (unsigned_p)
4924 unpack = gen_avx2_zero_extendv16qiv16hi2;
4925 else
4926 unpack = gen_avx2_sign_extendv16qiv16hi2;
4927 halfmode = V16QImode;
4928 extract
4929 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4930 break;
4931 case E_V32HImode:
4932 if (unsigned_p)
4933 unpack = gen_avx512f_zero_extendv16hiv16si2;
4934 else
4935 unpack = gen_avx512f_sign_extendv16hiv16si2;
4936 halfmode = V16HImode;
4937 extract
4938 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4939 break;
4940 case E_V16HImode:
4941 if (unsigned_p)
4942 unpack = gen_avx2_zero_extendv8hiv8si2;
4943 else
4944 unpack = gen_avx2_sign_extendv8hiv8si2;
4945 halfmode = V8HImode;
4946 extract
4947 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4948 break;
4949 case E_V16SImode:
4950 if (unsigned_p)
4951 unpack = gen_avx512f_zero_extendv8siv8di2;
4952 else
4953 unpack = gen_avx512f_sign_extendv8siv8di2;
4954 halfmode = V8SImode;
4955 extract
4956 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4957 break;
4958 case E_V8SImode:
4959 if (unsigned_p)
4960 unpack = gen_avx2_zero_extendv4siv4di2;
4961 else
4962 unpack = gen_avx2_sign_extendv4siv4di2;
4963 halfmode = V4SImode;
4964 extract
4965 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
4966 break;
4967 case E_V16QImode:
4968 if (unsigned_p)
4969 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
4970 else
4971 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
4972 break;
4973 case E_V8HImode:
4974 if (unsigned_p)
4975 unpack = gen_sse4_1_zero_extendv4hiv4si2;
4976 else
4977 unpack = gen_sse4_1_sign_extendv4hiv4si2;
4978 break;
4979 case E_V4SImode:
4980 if (unsigned_p)
4981 unpack = gen_sse4_1_zero_extendv2siv2di2;
4982 else
4983 unpack = gen_sse4_1_sign_extendv2siv2di2;
4984 break;
4985 default:
4986 gcc_unreachable ();
4987 }
4988
4989 if (GET_MODE_SIZE (imode) >= 32)
4990 {
4991 tmp = gen_reg_rtx (halfmode);
4992 emit_insn (extract (tmp, src));
4993 }
4994 else if (high_p)
4995 {
4996 /* Shift higher 8 bytes to lower 8 bytes. */
4997 tmp = gen_reg_rtx (V1TImode);
4998 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
4999 GEN_INT (64)));
5000 tmp = gen_lowpart (imode, tmp);
5001 }
5002 else
5003 tmp = src;
5004
5005 emit_insn (unpack (dest, tmp));
5006 }
5007 else
5008 {
5009 rtx (*unpack)(rtx, rtx, rtx);
5010
5011 switch (imode)
5012 {
5013 case E_V16QImode:
5014 if (high_p)
5015 unpack = gen_vec_interleave_highv16qi;
5016 else
5017 unpack = gen_vec_interleave_lowv16qi;
5018 break;
5019 case E_V8HImode:
5020 if (high_p)
5021 unpack = gen_vec_interleave_highv8hi;
5022 else
5023 unpack = gen_vec_interleave_lowv8hi;
5024 break;
5025 case E_V4SImode:
5026 if (high_p)
5027 unpack = gen_vec_interleave_highv4si;
5028 else
5029 unpack = gen_vec_interleave_lowv4si;
5030 break;
5031 default:
5032 gcc_unreachable ();
5033 }
5034
5035 if (unsigned_p)
5036 tmp = force_reg (imode, CONST0_RTX (imode));
5037 else
5038 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5039 src, pc_rtx, pc_rtx);
5040
5041 rtx tmp2 = gen_reg_rtx (imode);
5042 emit_insn (unpack (tmp2, src, tmp));
5043 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5044 }
5045 }
5046
5047 /* Split OPERAND into half-mode parts stored in PARTS. Similar to
5048 split_double_mode, but works for floating point parameters and
5049 non-offsettable memories. For pushes, it returns just stack offsets;
5050 the values will be saved in the right order. At most four parts
5051 are generated. */
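/* For example, a DFmode operand on a 32-bit target is split into two
   SImode parts and XFmode into three, while on a 64-bit target TFmode
   and XFmode are split into a DImode part plus a DImode or SImode
   upper part respectively.  */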
5051
5052 static int
5053 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5054 {
5055 int size;
5056
5057 if (!TARGET_64BIT)
5058 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5059 else
5060 size = (GET_MODE_SIZE (mode) + 4) / 8;
5061
5062 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5063 gcc_assert (size >= 2 && size <= 4);
5064
5065 /* Optimize constant pool reference to immediates. This is used by fp
5066 moves, that force all constants to memory to allow combining. */
5067 if (MEM_P (operand) && MEM_READONLY_P (operand))
5068 operand = avoid_constant_pool_reference (operand);
5069
5070 if (MEM_P (operand) && !offsettable_memref_p (operand))
5071 {
5072 /* The only non-offsettable memories we handle are pushes. */
5073 int ok = push_operand (operand, VOIDmode);
5074
5075 gcc_assert (ok);
5076
5077 operand = copy_rtx (operand);
5078 PUT_MODE (operand, word_mode);
5079 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5080 return size;
5081 }
5082
5083 if (GET_CODE (operand) == CONST_VECTOR)
5084 {
5085 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5086 /* Caution: if we looked through a constant pool memory above,
5087 the operand may actually have a different mode now. That's
5088 ok, since we want to pun this all the way back to an integer. */
5089 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5090 gcc_assert (operand != NULL);
5091 mode = imode;
5092 }
5093
5094 if (!TARGET_64BIT)
5095 {
5096 if (mode == DImode)
5097 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5098 else
5099 {
5100 int i;
5101
5102 if (REG_P (operand))
5103 {
5104 gcc_assert (reload_completed);
5105 for (i = 0; i < size; i++)
5106 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5107 }
5108 else if (offsettable_memref_p (operand))
5109 {
5110 operand = adjust_address (operand, SImode, 0);
5111 parts[0] = operand;
5112 for (i = 1; i < size; i++)
5113 parts[i] = adjust_address (operand, SImode, 4 * i);
5114 }
5115 else if (CONST_DOUBLE_P (operand))
5116 {
5117 const REAL_VALUE_TYPE *r;
5118 long l[4];
5119
5120 r = CONST_DOUBLE_REAL_VALUE (operand);
5121 switch (mode)
5122 {
5123 case E_TFmode:
5124 real_to_target (l, r, mode);
5125 parts[3] = gen_int_mode (l[3], SImode);
5126 parts[2] = gen_int_mode (l[2], SImode);
5127 break;
5128 case E_XFmode:
5129 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5130 long double may not be 80-bit. */
5131 real_to_target (l, r, mode);
5132 parts[2] = gen_int_mode (l[2], SImode);
5133 break;
5134 case E_DFmode:
5135 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5136 break;
5137 default:
5138 gcc_unreachable ();
5139 }
5140 parts[1] = gen_int_mode (l[1], SImode);
5141 parts[0] = gen_int_mode (l[0], SImode);
5142 }
5143 else
5144 gcc_unreachable ();
5145 }
5146 }
5147 else
5148 {
5149 if (mode == TImode)
5150 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5151 if (mode == XFmode || mode == TFmode)
5152 {
5153 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5154 if (REG_P (operand))
5155 {
5156 gcc_assert (reload_completed);
5157 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5158 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5159 }
5160 else if (offsettable_memref_p (operand))
5161 {
5162 operand = adjust_address (operand, DImode, 0);
5163 parts[0] = operand;
5164 parts[1] = adjust_address (operand, upper_mode, 8);
5165 }
5166 else if (CONST_DOUBLE_P (operand))
5167 {
5168 long l[4];
5169
5170 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5171
5172 /* real_to_target puts 32-bit pieces in each long. */
5173 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5174 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5175 << 32), DImode);
5176
5177 if (upper_mode == SImode)
5178 parts[1] = gen_int_mode (l[2], SImode);
5179 else
5180 parts[1]
5181 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5182 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5183 << 32), DImode);
5184 }
5185 else
5186 gcc_unreachable ();
5187 }
5188 }
5189
5190 return size;
5191 }
5192
5193 /* Emit insns to perform a move or push of DI, DF, XF, and TF values;
5194 all required insns are emitted here. Operands 2 and up are used
5195 as scratch storage for the destination parts and operands 6 and up
5196 for the source parts, in the order in which they are moved. */
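/* For example, a DImode register-to-register move on a 32-bit target is
   split into two SImode moves, emitted in an order that avoids
   clobbering a source part (or a register used in the source address)
   before it has been read.  */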
5197
5198 void
5199 ix86_split_long_move (rtx operands[])
5200 {
5201 rtx part[2][4];
5202 int nparts, i, j;
5203 int push = 0;
5204 int collisions = 0;
5205 machine_mode mode = GET_MODE (operands[0]);
5206 bool collisionparts[4];
5207
5208 /* The DFmode expanders may ask us to move a double.
5209 For a 64-bit target this is a single move. By hiding that fact
5210 here we simplify the i386.md splitters. */
5211 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5212 {
5213 /* Optimize constant pool reference to immediates. This is used by
5214 fp moves, that force all constants to memory to allow combining. */
5215
5216 if (MEM_P (operands[1])
5217 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5218 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5219 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5220 if (push_operand (operands[0], VOIDmode))
5221 {
5222 operands[0] = copy_rtx (operands[0]);
5223 PUT_MODE (operands[0], word_mode);
5224 }
5225 else
5226 operands[0] = gen_lowpart (DImode, operands[0]);
5227 operands[1] = gen_lowpart (DImode, operands[1]);
5228 emit_move_insn (operands[0], operands[1]);
5229 return;
5230 }
5231
5232 /* The only non-offsettable memory we handle is push. */
5233 if (push_operand (operands[0], VOIDmode))
5234 push = 1;
5235 else
5236 gcc_assert (!MEM_P (operands[0])
5237 || offsettable_memref_p (operands[0]));
5238
5239 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5240 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5241
5242 /* When emitting push, take care for source operands on the stack. */
5243 if (push && MEM_P (operands[1])
5244 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5245 {
5246 rtx src_base = XEXP (part[1][nparts - 1], 0);
5247
5248 /* Compensate for the stack decrement by 4. */
5249 if (!TARGET_64BIT && nparts == 3
5250 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5251 src_base = plus_constant (Pmode, src_base, 4);
5252
5253 /* src_base refers to the stack pointer and is
5254 automatically decreased by emitted push. */
5255 for (i = 0; i < nparts; i++)
5256 part[1][i] = change_address (part[1][i],
5257 GET_MODE (part[1][i]), src_base);
5258 }
5259
5260 /* We need to do the copy in the right order in case an address register
5261 of the source overlaps the destination. */
5262 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5263 {
5264 rtx tmp;
5265
5266 for (i = 0; i < nparts; i++)
5267 {
5268 collisionparts[i]
5269 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5270 if (collisionparts[i])
5271 collisions++;
5272 }
5273
5274 /* Collision in the middle part can be handled by reordering. */
5275 if (collisions == 1 && nparts == 3 && collisionparts [1])
5276 {
5277 std::swap (part[0][1], part[0][2]);
5278 std::swap (part[1][1], part[1][2]);
5279 }
5280 else if (collisions == 1
5281 && nparts == 4
5282 && (collisionparts [1] || collisionparts [2]))
5283 {
5284 if (collisionparts [1])
5285 {
5286 std::swap (part[0][1], part[0][2]);
5287 std::swap (part[1][1], part[1][2]);
5288 }
5289 else
5290 {
5291 std::swap (part[0][2], part[0][3]);
5292 std::swap (part[1][2], part[1][3]);
5293 }
5294 }
5295
5296 /* If there are more collisions, we can't handle it by reordering.
5297 Do an lea to the last part and use only one colliding move. */
5298 else if (collisions > 1)
5299 {
5300 rtx base, addr;
5301
5302 collisions = 1;
5303
5304 base = part[0][nparts - 1];
5305
5306 /* Handle the case when the last part isn't valid for lea.
5307 Happens in 64-bit mode storing the 12-byte XFmode. */
5308 if (GET_MODE (base) != Pmode)
5309 base = gen_rtx_REG (Pmode, REGNO (base));
5310
5311 addr = XEXP (part[1][0], 0);
5312 if (TARGET_TLS_DIRECT_SEG_REFS)
5313 {
5314 struct ix86_address parts;
5315 int ok = ix86_decompose_address (addr, &parts);
5316 gcc_assert (ok);
5317 /* It is not valid to use %gs: or %fs: in lea. */
5318 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5319 }
5320 emit_insn (gen_rtx_SET (base, addr));
5321 part[1][0] = replace_equiv_address (part[1][0], base);
5322 for (i = 1; i < nparts; i++)
5323 {
5324 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5325 part[1][i] = replace_equiv_address (part[1][i], tmp);
5326 }
5327 }
5328 }
5329
5330 if (push)
5331 {
5332 if (!TARGET_64BIT)
5333 {
5334 if (nparts == 3)
5335 {
5336 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5337 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5338 emit_move_insn (part[0][2], part[1][2]);
5339 }
5340 else if (nparts == 4)
5341 {
5342 emit_move_insn (part[0][3], part[1][3]);
5343 emit_move_insn (part[0][2], part[1][2]);
5344 }
5345 }
5346 else
5347 {
5348 /* In 64-bit mode we don't have a 32-bit push available. In case this
5349 is a register, it is OK - we will just use the larger counterpart.
5350 We also retype memory - this comes from an attempt to avoid a REX
5351 prefix when moving the second half of a TFmode value. */
5352 if (GET_MODE (part[1][1]) == SImode)
5353 {
5354 switch (GET_CODE (part[1][1]))
5355 {
5356 case MEM:
5357 part[1][1] = adjust_address (part[1][1], DImode, 0);
5358 break;
5359
5360 case REG:
5361 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5362 break;
5363
5364 default:
5365 gcc_unreachable ();
5366 }
5367
5368 if (GET_MODE (part[1][0]) == SImode)
5369 part[1][0] = part[1][1];
5370 }
5371 }
5372 emit_move_insn (part[0][1], part[1][1]);
5373 emit_move_insn (part[0][0], part[1][0]);
5374 return;
5375 }
5376
5377 /* Choose correct order to not overwrite the source before it is copied. */
5378 if ((REG_P (part[0][0])
5379 && REG_P (part[1][1])
5380 && (REGNO (part[0][0]) == REGNO (part[1][1])
5381 || (nparts == 3
5382 && REGNO (part[0][0]) == REGNO (part[1][2]))
5383 || (nparts == 4
5384 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5385 || (collisions > 0
5386 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5387 {
5388 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5389 {
5390 operands[2 + i] = part[0][j];
5391 operands[6 + i] = part[1][j];
5392 }
5393 }
5394 else
5395 {
5396 for (i = 0; i < nparts; i++)
5397 {
5398 operands[2 + i] = part[0][i];
5399 operands[6 + i] = part[1][i];
5400 }
5401 }
5402
5403 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5404 if (optimize_insn_for_size_p ())
5405 {
5406 for (j = 0; j < nparts - 1; j++)
5407 if (CONST_INT_P (operands[6 + j])
5408 && operands[6 + j] != const0_rtx
5409 && REG_P (operands[2 + j]))
5410 for (i = j; i < nparts - 1; i++)
5411 if (CONST_INT_P (operands[7 + i])
5412 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5413 operands[7 + i] = operands[2 + j];
5414 }
5415
5416 for (i = 0; i < nparts; i++)
5417 emit_move_insn (operands[2 + i], operands[6 + i]);
5418
5419 return;
5420 }
5421
5422 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5423 left shift by a constant, either using a single shift or
5424 a sequence of add instructions. */
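/* E.g. a shift by 1 is always emitted as a single add of the operand to
   itself; a shift by 2 may become two adds when that is cheaper than
   one shift-by-constant and we are not optimizing for size, otherwise a
   single shift instruction is emitted.  */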
5425
5426 static void
5427 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5428 {
5429 if (count == 1
5430 || (count * ix86_cost->add <= ix86_cost->shift_const
5431 && !optimize_insn_for_size_p ()))
5432 {
5433 while (count-- > 0)
5434 emit_insn (gen_add2_insn (operand, operand));
5435 }
5436 else
5437 {
5438 rtx (*insn)(rtx, rtx, rtx);
5439
5440 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5441 emit_insn (insn (operand, operand, GEN_INT (count)));
5442 }
5443 }
5444
5445 void
5446 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5447 {
5448 rtx (*gen_ashl3)(rtx, rtx, rtx);
5449 rtx (*gen_shld)(rtx, rtx, rtx);
5450 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5451 machine_mode half_mode;
5452
5453 rtx low[2], high[2];
5454 int count;
5455
5456 if (CONST_INT_P (operands[2]))
5457 {
5458 split_double_mode (mode, operands, 2, low, high);
5459 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5460
5461 if (count >= half_width)
5462 {
5463 emit_move_insn (high[0], low[1]);
5464 emit_move_insn (low[0], const0_rtx);
5465
5466 if (count > half_width)
5467 ix86_expand_ashl_const (high[0], count - half_width, mode);
5468 }
5469 else
5470 {
5471 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5472
5473 if (!rtx_equal_p (operands[0], operands[1]))
5474 emit_move_insn (operands[0], operands[1]);
5475
5476 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5477 ix86_expand_ashl_const (low[0], count, mode);
5478 }
5479 return;
5480 }
5481
5482 split_double_mode (mode, operands, 1, low, high);
5483 half_mode = mode == DImode ? SImode : DImode;
5484
5485 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5486
5487 if (operands[1] == const1_rtx)
5488 {
5489 /* Assuming we've chosen QImode-capable registers, 1 << N
5490 can be done with two 32/64-bit shifts, no branches, no cmoves. */
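/* Sketch for DImode 1 << N on a 32-bit target: set low = ((N & 32) == 0)
   and high = ((N & 32) != 0), then shift both halves left by N; since
   the 32-bit shifts only use the low 5 bits of N, the single set bit
   ends up in the correct half at the correct position.  */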
5491 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5492 {
5493 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5494
5495 ix86_expand_clear (low[0]);
5496 ix86_expand_clear (high[0]);
5497 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5498
5499 d = gen_lowpart (QImode, low[0]);
5500 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5501 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5502 emit_insn (gen_rtx_SET (d, s));
5503
5504 d = gen_lowpart (QImode, high[0]);
5505 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5506 s = gen_rtx_NE (QImode, flags, const0_rtx);
5507 emit_insn (gen_rtx_SET (d, s));
5508 }
5509
5510 /* Otherwise, we can get the same results by manually performing
5511 a bit extract operation on bit 5/6, and then performing the two
5512 shifts. The two methods of getting 0/1 into low/high are exactly
5513 the same size. Avoiding the shift in the bit extract case helps
5514 pentium4 a bit; no one else seems to care much either way. */
5515 else
5516 {
5517 rtx (*gen_lshr3)(rtx, rtx, rtx);
5518 rtx (*gen_and3)(rtx, rtx, rtx);
5519 rtx (*gen_xor3)(rtx, rtx, rtx);
5520 HOST_WIDE_INT bits;
5521 rtx x;
5522
5523 if (mode == DImode)
5524 {
5525 gen_lshr3 = gen_lshrsi3;
5526 gen_and3 = gen_andsi3;
5527 gen_xor3 = gen_xorsi3;
5528 bits = 5;
5529 }
5530 else
5531 {
5532 gen_lshr3 = gen_lshrdi3;
5533 gen_and3 = gen_anddi3;
5534 gen_xor3 = gen_xordi3;
5535 bits = 6;
5536 }
5537
5538 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5539 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5540 else
5541 x = gen_lowpart (half_mode, operands[2]);
5542 emit_insn (gen_rtx_SET (high[0], x));
5543
5544 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5545 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5546 emit_move_insn (low[0], high[0]);
5547 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5548 }
5549
5550 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5551 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5552 return;
5553 }
5554
5555 if (operands[1] == constm1_rtx)
5556 {
5557 /* For -1 << N, we can avoid the shld instruction, because we
5558 know that we're shifting 0...31/63 ones into a -1. */
5559 emit_move_insn (low[0], constm1_rtx);
5560 if (optimize_insn_for_size_p ())
5561 emit_move_insn (high[0], low[0]);
5562 else
5563 emit_move_insn (high[0], constm1_rtx);
5564 }
5565 else
5566 {
5567 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5568
5569 if (!rtx_equal_p (operands[0], operands[1]))
5570 emit_move_insn (operands[0], operands[1]);
5571
5572 split_double_mode (mode, operands, 1, low, high);
5573 emit_insn (gen_shld (high[0], low[0], operands[2]));
5574 }
5575
5576 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5577
5578 if (TARGET_CMOVE && scratch)
5579 {
5580 ix86_expand_clear (scratch);
5581 emit_insn (gen_x86_shift_adj_1
5582 (half_mode, high[0], low[0], operands[2], scratch));
5583 }
5584 else
5585 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5586 }
5587
5588 void
5589 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5590 {
5591 rtx (*gen_ashr3)(rtx, rtx, rtx)
5592 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5593 rtx (*gen_shrd)(rtx, rtx, rtx);
5594 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5595
5596 rtx low[2], high[2];
5597 int count;
5598
5599 if (CONST_INT_P (operands[2]))
5600 {
5601 split_double_mode (mode, operands, 2, low, high);
5602 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5603
5604 if (count == GET_MODE_BITSIZE (mode) - 1)
5605 {
5606 emit_move_insn (high[0], high[1]);
5607 emit_insn (gen_ashr3 (high[0], high[0],
5608 GEN_INT (half_width - 1)));
5609 emit_move_insn (low[0], high[0]);
5610
5611 }
5612 else if (count >= half_width)
5613 {
5614 emit_move_insn (low[0], high[1]);
5615 emit_move_insn (high[0], low[0]);
5616 emit_insn (gen_ashr3 (high[0], high[0],
5617 GEN_INT (half_width - 1)));
5618
5619 if (count > half_width)
5620 emit_insn (gen_ashr3 (low[0], low[0],
5621 GEN_INT (count - half_width)));
5622 }
5623 else
5624 {
5625 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5626
5627 if (!rtx_equal_p (operands[0], operands[1]))
5628 emit_move_insn (operands[0], operands[1]);
5629
5630 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5631 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5632 }
5633 }
5634 else
5635 {
5636 machine_mode half_mode;
5637
5638 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5639
5640 if (!rtx_equal_p (operands[0], operands[1]))
5641 emit_move_insn (operands[0], operands[1]);
5642
5643 split_double_mode (mode, operands, 1, low, high);
5644 half_mode = mode == DImode ? SImode : DImode;
5645
5646 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5647 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5648
5649 if (TARGET_CMOVE && scratch)
5650 {
5651 emit_move_insn (scratch, high[0]);
5652 emit_insn (gen_ashr3 (scratch, scratch,
5653 GEN_INT (half_width - 1)));
5654 emit_insn (gen_x86_shift_adj_1
5655 (half_mode, low[0], high[0], operands[2], scratch));
5656 }
5657 else
5658 emit_insn (gen_x86_shift_adj_3
5659 (half_mode, low[0], high[0], operands[2]));
5660 }
5661 }
5662
5663 void
5664 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5665 {
5666 rtx (*gen_lshr3)(rtx, rtx, rtx)
5667 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5668 rtx (*gen_shrd)(rtx, rtx, rtx);
5669 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5670
5671 rtx low[2], high[2];
5672 int count;
5673
5674 if (CONST_INT_P (operands[2]))
5675 {
5676 split_double_mode (mode, operands, 2, low, high);
5677 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5678
5679 if (count >= half_width)
5680 {
5681 emit_move_insn (low[0], high[1]);
5682 ix86_expand_clear (high[0]);
5683
5684 if (count > half_width)
5685 emit_insn (gen_lshr3 (low[0], low[0],
5686 GEN_INT (count - half_width)));
5687 }
5688 else
5689 {
5690 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5691
5692 if (!rtx_equal_p (operands[0], operands[1]))
5693 emit_move_insn (operands[0], operands[1]);
5694
5695 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5696 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5697 }
5698 }
5699 else
5700 {
5701 machine_mode half_mode;
5702
5703 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5704
5705 if (!rtx_equal_p (operands[0], operands[1]))
5706 emit_move_insn (operands[0], operands[1]);
5707
5708 split_double_mode (mode, operands, 1, low, high);
5709 half_mode = mode == DImode ? SImode : DImode;
5710
5711 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5712 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5713
5714 if (TARGET_CMOVE && scratch)
5715 {
5716 ix86_expand_clear (scratch);
5717 emit_insn (gen_x86_shift_adj_1
5718 (half_mode, low[0], high[0], operands[2], scratch));
5719 }
5720 else
5721 emit_insn (gen_x86_shift_adj_2
5722 (half_mode, low[0], high[0], operands[2]));
5723 }
5724 }
5725
5726 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5727 DImode for constant loop counts. */
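/* E.g. a VOIDmode constant count above 0xffffffff yields DImode on
   64-bit targets, a smaller constant yields SImode, and anything whose
   mode is already known keeps that mode.  */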
5728
5729 static machine_mode
5730 counter_mode (rtx count_exp)
5731 {
5732 if (GET_MODE (count_exp) != VOIDmode)
5733 return GET_MODE (count_exp);
5734 if (!CONST_INT_P (count_exp))
5735 return Pmode;
5736 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5737 return DImode;
5738 return SImode;
5739 }
5740
5741 /* When ISSETMEM is FALSE, output a simple loop to copy memory pointed to by
5742 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
5743 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
5744 equivalent loop to set memory to VALUE (supposed to be in MODE).
5745 
5746 The size is rounded down to a whole number of chunks moved at once.
5747 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
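/* The emitted code is roughly equivalent to this C sketch (copy case,
   with CHUNK = GET_MODE_SIZE (MODE) * UNROLL):

     size = count & ~(CHUNK - 1);
     for (iter = 0; iter < size; iter += CHUNK)
       copy CHUNK bytes from srcptr + iter to destptr + iter;
     destptr += size;
     srcptr += size;  */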
5748
5749
5750 static void
5751 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5752 rtx destptr, rtx srcptr, rtx value,
5753 rtx count, machine_mode mode, int unroll,
5754 int expected_size, bool issetmem)
5755 {
5756 rtx_code_label *out_label, *top_label;
5757 rtx iter, tmp;
5758 machine_mode iter_mode = counter_mode (count);
5759 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5760 rtx piece_size = GEN_INT (piece_size_n);
5761 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5762 rtx size;
5763 int i;
5764
5765 top_label = gen_label_rtx ();
5766 out_label = gen_label_rtx ();
5767 iter = gen_reg_rtx (iter_mode);
5768
5769 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5770 NULL, 1, OPTAB_DIRECT);
5771 /* Those two should combine. */
5772 if (piece_size == const1_rtx)
5773 {
5774 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5775 true, out_label);
5776 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5777 }
5778 emit_move_insn (iter, const0_rtx);
5779
5780 emit_label (top_label);
5781
5782 tmp = convert_modes (Pmode, iter_mode, iter, true);
5783
5784 /* This assert could be relaxed - in that case we'll need to compute
5785 the smallest power of two containing PIECE_SIZE_N and pass it to
5786 offset_address. */
5787 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5788 destmem = offset_address (destmem, tmp, piece_size_n);
5789 destmem = adjust_address (destmem, mode, 0);
5790
5791 if (!issetmem)
5792 {
5793 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5794 srcmem = adjust_address (srcmem, mode, 0);
5795
5796 /* When unrolling for chips that reorder memory reads and writes,
5797 we can save registers by using a single temporary.
5798 Also using 4 temporaries is overkill in 32-bit mode. */
5799 if (!TARGET_64BIT && 0)
5800 {
5801 for (i = 0; i < unroll; i++)
5802 {
5803 if (i)
5804 {
5805 destmem = adjust_address (copy_rtx (destmem), mode,
5806 GET_MODE_SIZE (mode));
5807 srcmem = adjust_address (copy_rtx (srcmem), mode,
5808 GET_MODE_SIZE (mode));
5809 }
5810 emit_move_insn (destmem, srcmem);
5811 }
5812 }
5813 else
5814 {
5815 rtx tmpreg[4];
5816 gcc_assert (unroll <= 4);
5817 for (i = 0; i < unroll; i++)
5818 {
5819 tmpreg[i] = gen_reg_rtx (mode);
5820 if (i)
5821 srcmem = adjust_address (copy_rtx (srcmem), mode,
5822 GET_MODE_SIZE (mode));
5823 emit_move_insn (tmpreg[i], srcmem);
5824 }
5825 for (i = 0; i < unroll; i++)
5826 {
5827 if (i)
5828 destmem = adjust_address (copy_rtx (destmem), mode,
5829 GET_MODE_SIZE (mode));
5830 emit_move_insn (destmem, tmpreg[i]);
5831 }
5832 }
5833 }
5834 else
5835 for (i = 0; i < unroll; i++)
5836 {
5837 if (i)
5838 destmem = adjust_address (copy_rtx (destmem), mode,
5839 GET_MODE_SIZE (mode));
5840 emit_move_insn (destmem, value);
5841 }
5842
5843 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5844 true, OPTAB_LIB_WIDEN);
5845 if (tmp != iter)
5846 emit_move_insn (iter, tmp);
5847
5848 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5849 true, top_label);
5850 if (expected_size != -1)
5851 {
5852 expected_size /= GET_MODE_SIZE (mode) * unroll;
5853 if (expected_size == 0)
5854 predict_jump (0);
5855 else if (expected_size > REG_BR_PROB_BASE)
5856 predict_jump (REG_BR_PROB_BASE - 1);
5857 else
5858 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5859 / expected_size);
5860 }
5861 else
5862 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5863 iter = ix86_zero_extend_to_Pmode (iter);
5864 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5865 true, OPTAB_LIB_WIDEN);
5866 if (tmp != destptr)
5867 emit_move_insn (destptr, tmp);
5868 if (!issetmem)
5869 {
5870 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5871 true, OPTAB_LIB_WIDEN);
5872 if (tmp != srcptr)
5873 emit_move_insn (srcptr, tmp);
5874 }
5875 emit_label (out_label);
5876 }
5877
5878 /* Divide COUNTREG by SCALE. */
5879 static rtx
5880 scale_counter (rtx countreg, int scale)
5881 {
5882 rtx sc;
5883
5884 if (scale == 1)
5885 return countreg;
5886 if (CONST_INT_P (countreg))
5887 return GEN_INT (INTVAL (countreg) / scale);
5888 gcc_assert (REG_P (countreg));
5889
5890 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5891 GEN_INT (exact_log2 (scale)),
5892 NULL, 1, OPTAB_DIRECT);
5893 return sc;
5894 }
5895
5896 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5897 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5898 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5899 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5900 ORIG_VALUE is the original value passed to memset to fill the memory with.
5901 Other arguments have the same meaning as for the previous function. */
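/* For instance, a copy with MODE == SImode scales the byte count down by
   4, zero-extends it to Pmode and emits a single "rep movs" moving four
   bytes per iteration; DESTEXP and SRCEXP give the pointer values after
   the copy for the insn pattern.  */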
5902
5903 static void
5904 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5905 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5906 rtx count,
5907 machine_mode mode, bool issetmem)
5908 {
5909 rtx destexp;
5910 rtx srcexp;
5911 rtx countreg;
5912 HOST_WIDE_INT rounded_count;
5913
5914 /* If possible, it is shorter to use rep movs.
5915 TODO: Maybe it is better to move this logic to decide_alg. */
5916 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5917 && (!issetmem || orig_value == const0_rtx))
5918 mode = SImode;
5919
5920 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5921 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5922
5923 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5924 GET_MODE_SIZE (mode)));
5925 if (mode != QImode)
5926 {
5927 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5928 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5929 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5930 }
5931 else
5932 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5933 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5934 {
5935 rounded_count
5936 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5937 destmem = shallow_copy_rtx (destmem);
5938 set_mem_size (destmem, rounded_count);
5939 }
5940 else if (MEM_SIZE_KNOWN_P (destmem))
5941 clear_mem_size (destmem);
5942
5943 if (issetmem)
5944 {
5945 value = force_reg (mode, gen_lowpart (mode, value));
5946 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5947 }
5948 else
5949 {
5950 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5951 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5952 if (mode != QImode)
5953 {
5954 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5955 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5956 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5957 }
5958 else
5959 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
5960 if (CONST_INT_P (count))
5961 {
5962 rounded_count
5963 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5964 srcmem = shallow_copy_rtx (srcmem);
5965 set_mem_size (srcmem, rounded_count);
5966 }
5967 else
5968 {
5969 if (MEM_SIZE_KNOWN_P (srcmem))
5970 clear_mem_size (srcmem);
5971 }
5972 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
5973 destexp, srcexp));
5974 }
5975 }
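
/* Illustrative sketch, not compiler code: the DESTEXP/SRCEXP expressions
   built above describe the pointer values after the "rep" insn completes,
   i.e. pointer + (count << log2 (element size)).  A hypothetical C model
   of the generated "rep stosl" semantics:

     #include <stddef.h>
     #include <stdint.h>

     // Store COUNT 32-bit VALUEs and return the final destination
     // pointer (what DESTEXP evaluates to).
     static inline uint32_t *
     model_rep_stosl (uint32_t *dest, uint32_t value, size_t count)
     {
       for (size_t i = 0; i < count; i++)
         dest[i] = value;
       return dest + count;   // dest advanced by count << 2 bytes
     }
*/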
5976
5977 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
5978 DESTMEM.
5979 SRCMEM is passed by pointer so it can be updated on return.
5980 The return value is the updated DESTMEM. */
5981 static rtx
5982 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
5983 HOST_WIDE_INT size_to_move)
5984 {
5985 rtx dst = destmem, src = *srcmem, tempreg;
5986 enum insn_code code;
5987 machine_mode move_mode;
5988 int piece_size, i;
5989
5990 /* Find the widest mode in which we could perform moves.
5991 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
5992 it until a move of that size is supported. */
5993 piece_size = 1 << floor_log2 (size_to_move);
5994 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
5995 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
5996 {
5997 gcc_assert (piece_size > 1);
5998 piece_size >>= 1;
5999 }
6000
6001 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6002 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6003 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6004 {
6005 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6006 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6007 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6008 {
6009 move_mode = word_mode;
6010 piece_size = GET_MODE_SIZE (move_mode);
6011 code = optab_handler (mov_optab, move_mode);
6012 }
6013 }
6014 gcc_assert (code != CODE_FOR_nothing);
6015
6016 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6017 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6018
6019 /* Emit moves. We'll need SIZE_TO_MOVE / PIECE_SIZE moves. */
6020 gcc_assert (size_to_move % piece_size == 0);
6021
6022 for (i = 0; i < size_to_move; i += piece_size)
6023 {
6024 /* We move from memory to memory, so we'll need to do it via
6025 a temporary register. */
6026 tempreg = gen_reg_rtx (move_mode);
6027 emit_insn (GEN_FCN (code) (tempreg, src));
6028 emit_insn (GEN_FCN (code) (dst, tempreg));
6029
6030 emit_move_insn (destptr,
6031 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6032 emit_move_insn (srcptr,
6033 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6034
6035 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6036 piece_size);
6037 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6038 piece_size);
6039 }
6040
6041 /* Update DST and SRC rtx. */
6042 *srcmem = src;
6043 return dst;
6044 }
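
/* Illustrative sketch, not compiler code: the mode selection above means
   "largest supported power-of-two piece not larger than SIZE_TO_MOVE".
   A hypothetical stand-alone version of that search (assumes byte moves
   are always supported, as the assertion above does):

     static int
     widest_piece (int size_to_move, int (*movable_p) (int))
     {
       int piece = 1;
       while (piece * 2 <= size_to_move)
         piece *= 2;                 // 1 << floor_log2 (size_to_move)
       while (piece > 1 && !movable_p (piece))
         piece >>= 1;                // halve until a move is supported
       return piece;
     }
*/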
6045
6046 /* Helper function for the string operations below. Test VARIABLE whether
6047 it is aligned to VALUE bytes. If true, jump to the label. */
6048
6049 static rtx_code_label *
6050 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6051 {
6052 rtx_code_label *label = gen_label_rtx ();
6053 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6054 if (GET_MODE (variable) == DImode)
6055 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6056 else
6057 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6058 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6059 1, label);
6060 if (epilogue)
6061 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6062 else
6063 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6064 return label;
6065 }
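
/* Illustrative sketch, not compiler code: the AND/compare emitted above
   tests one alignment bit of VARIABLE and jumps to the label when it is
   clear.  In C terms the generated test is roughly:

     #include <stdint.h>

     // VALUE is a power-of-two alignment bit (1, 2, 4, ...).  Nonzero
     // result means the emitted jump to the returned label is taken.
     static inline int
     aligntest_would_jump (uintptr_t variable, uintptr_t value)
     {
       return (variable & value) == 0;
     }
*/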
6066
6067
6068 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6069
6070 static void
6071 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6072 rtx destptr, rtx srcptr, rtx count, int max_size)
6073 {
6074 rtx src, dest;
6075 if (CONST_INT_P (count))
6076 {
6077 HOST_WIDE_INT countval = INTVAL (count);
6078 HOST_WIDE_INT epilogue_size = countval % max_size;
6079 int i;
6080
6081 /* For now MAX_SIZE should be a power of 2. This assert could be
6082 relaxed, but it'll require a bit more complicated epilogue
6083 expanding. */
6084 gcc_assert ((max_size & (max_size - 1)) == 0);
6085 for (i = max_size; i >= 1; i >>= 1)
6086 {
6087 if (epilogue_size & i)
6088 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6089 }
6090 return;
6091 }
6092 if (max_size > 8)
6093 {
6094 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6095 count, 1, OPTAB_DIRECT);
6096 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6097 count, QImode, 1, 4, false);
6098 return;
6099 }
6100
6101 /* When single stringops are available, we can cheaply advance the dest and
6102 src pointers. Otherwise we save code size by maintaining an offset (zero is
6103 readily available from the preceding rep operation) and using x86 addressing
6104 modes. */
6105 if (TARGET_SINGLE_STRINGOP)
6106 {
6107 if (max_size > 4)
6108 {
6109 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6110 src = change_address (srcmem, SImode, srcptr);
6111 dest = change_address (destmem, SImode, destptr);
6112 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6113 emit_label (label);
6114 LABEL_NUSES (label) = 1;
6115 }
6116 if (max_size > 2)
6117 {
6118 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6119 src = change_address (srcmem, HImode, srcptr);
6120 dest = change_address (destmem, HImode, destptr);
6121 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6122 emit_label (label);
6123 LABEL_NUSES (label) = 1;
6124 }
6125 if (max_size > 1)
6126 {
6127 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6128 src = change_address (srcmem, QImode, srcptr);
6129 dest = change_address (destmem, QImode, destptr);
6130 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6131 emit_label (label);
6132 LABEL_NUSES (label) = 1;
6133 }
6134 }
6135 else
6136 {
6137 rtx offset = force_reg (Pmode, const0_rtx);
6138 rtx tmp;
6139
6140 if (max_size > 4)
6141 {
6142 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6143 src = change_address (srcmem, SImode, srcptr);
6144 dest = change_address (destmem, SImode, destptr);
6145 emit_move_insn (dest, src);
6146 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6147 true, OPTAB_LIB_WIDEN);
6148 if (tmp != offset)
6149 emit_move_insn (offset, tmp);
6150 emit_label (label);
6151 LABEL_NUSES (label) = 1;
6152 }
6153 if (max_size > 2)
6154 {
6155 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6156 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6157 src = change_address (srcmem, HImode, tmp);
6158 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6159 dest = change_address (destmem, HImode, tmp);
6160 emit_move_insn (dest, src);
6161 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6162 true, OPTAB_LIB_WIDEN);
6163 if (tmp != offset)
6164 emit_move_insn (offset, tmp);
6165 emit_label (label);
6166 LABEL_NUSES (label) = 1;
6167 }
6168 if (max_size > 1)
6169 {
6170 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6171 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6172 src = change_address (srcmem, QImode, tmp);
6173 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6174 dest = change_address (destmem, QImode, tmp);
6175 emit_move_insn (dest, src);
6176 emit_label (label);
6177 LABEL_NUSES (label) = 1;
6178 }
6179 }
6180 }
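
/* Illustrative sketch, not compiler code: for a known COUNT the epilogue
   above walks the set bits of COUNT % MAX_SIZE from the highest one down
   and emits one move per set bit.  A hypothetical byte-level model, with
   DST/SRC already pointing at the tail left over by the main loop:

     #include <string.h>

     static void
     model_epilogue_copy (char *dst, const char *src, long count, int max_size)
     {
       long rem = count % max_size;   // max_size is a power of two
       for (int i = max_size; i >= 1; i >>= 1)
         if (rem & i)
           {
             memcpy (dst, src, i);
             dst += i;
             src += i;
           }
     }
*/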
6181
6182 /* This function emits stores to fill SIZE_TO_MOVE bytes starting from DESTMEM
6183 with value PROMOTED_VAL.
6184 DESTPTR is advanced as the stores are emitted.
6185 The return value is the updated DESTMEM. */
6186 static rtx
6187 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6188 HOST_WIDE_INT size_to_move)
6189 {
6190 rtx dst = destmem;
6191 enum insn_code code;
6192 machine_mode move_mode;
6193 int piece_size, i;
6194
6195 /* Find the widest mode in which we can perform the stores. Normally this
6196 is the mode of PROMOTED_VAL; narrow it when SIZE_TO_MOVE is smaller than
6197 that mode's size. */
6198 move_mode = GET_MODE (promoted_val);
6199 if (move_mode == VOIDmode)
6200 move_mode = QImode;
6201 if (size_to_move < GET_MODE_SIZE (move_mode))
6202 {
6203 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6204 move_mode = int_mode_for_size (move_bits, 0).require ();
6205 promoted_val = gen_lowpart (move_mode, promoted_val);
6206 }
6207 piece_size = GET_MODE_SIZE (move_mode);
6208 code = optab_handler (mov_optab, move_mode);
6209 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6210
6211 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6212
6213 /* Emit moves. We'll need SIZE_TO_MOVE / PIECE_SIZE moves. */
6214 gcc_assert (size_to_move % piece_size == 0);
6215
6216 for (i = 0; i < size_to_move; i += piece_size)
6217 {
6218 if (piece_size <= GET_MODE_SIZE (word_mode))
6219 {
6220 emit_insn (gen_strset (destptr, dst, promoted_val));
6221 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6222 piece_size);
6223 continue;
6224 }
6225
6226 emit_insn (GEN_FCN (code) (dst, promoted_val));
6227
6228 emit_move_insn (destptr,
6229 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6230
6231 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6232 piece_size);
6233 }
6234
6235 /* Update DST rtx. */
6236 return dst;
6237 }
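
/* Illustrative sketch, not compiler code: emit_memset stores a value that
   was already broadcast to the move width (e.g. 0xAB replicated into a
   64-bit word), one piece at a time.  A hypothetical C equivalent for
   word-sized pieces:

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     // SIZE is a multiple of 8, PROMOTED_VAL is the pre-broadcast value.
     static void
     model_wide_memset (char *dst, uint64_t promoted_val, size_t size)
     {
       for (size_t i = 0; i < size; i += sizeof promoted_val)
         memcpy (dst + i, &promoted_val, sizeof promoted_val);
     }
*/
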
6238 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6239 static void
6240 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6241 rtx count, int max_size)
6242 {
6243 count = expand_simple_binop (counter_mode (count), AND, count,
6244 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6245 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6246 gen_lowpart (QImode, value), count, QImode,
6247 1, max_size / 2, true);
6248 }
6249
6250 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6251 static void
6252 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6253 rtx count, int max_size)
6254 {
6255 rtx dest;
6256
6257 if (CONST_INT_P (count))
6258 {
6259 HOST_WIDE_INT countval = INTVAL (count);
6260 HOST_WIDE_INT epilogue_size = countval % max_size;
6261 int i;
6262
6263 /* For now MAX_SIZE should be a power of 2. This assert could be
6264 relaxed, but it'll require a bit more complicated epilogue
6265 expanding. */
6266 gcc_assert ((max_size & (max_size - 1)) == 0);
6267 for (i = max_size; i >= 1; i >>= 1)
6268 {
6269 if (epilogue_size & i)
6270 {
6271 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6272 destmem = emit_memset (destmem, destptr, vec_value, i);
6273 else
6274 destmem = emit_memset (destmem, destptr, value, i);
6275 }
6276 }
6277 return;
6278 }
6279 if (max_size > 32)
6280 {
6281 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6282 return;
6283 }
6284 if (max_size > 16)
6285 {
6286 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6287 if (TARGET_64BIT)
6288 {
6289 dest = change_address (destmem, DImode, destptr);
6290 emit_insn (gen_strset (destptr, dest, value));
6291 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6292 emit_insn (gen_strset (destptr, dest, value));
6293 }
6294 else
6295 {
6296 dest = change_address (destmem, SImode, destptr);
6297 emit_insn (gen_strset (destptr, dest, value));
6298 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6299 emit_insn (gen_strset (destptr, dest, value));
6300 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6301 emit_insn (gen_strset (destptr, dest, value));
6302 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6303 emit_insn (gen_strset (destptr, dest, value));
6304 }
6305 emit_label (label);
6306 LABEL_NUSES (label) = 1;
6307 }
6308 if (max_size > 8)
6309 {
6310 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6311 if (TARGET_64BIT)
6312 {
6313 dest = change_address (destmem, DImode, destptr);
6314 emit_insn (gen_strset (destptr, dest, value));
6315 }
6316 else
6317 {
6318 dest = change_address (destmem, SImode, destptr);
6319 emit_insn (gen_strset (destptr, dest, value));
6320 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6321 emit_insn (gen_strset (destptr, dest, value));
6322 }
6323 emit_label (label);
6324 LABEL_NUSES (label) = 1;
6325 }
6326 if (max_size > 4)
6327 {
6328 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6329 dest = change_address (destmem, SImode, destptr);
6330 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6331 emit_label (label);
6332 LABEL_NUSES (label) = 1;
6333 }
6334 if (max_size > 2)
6335 {
6336 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6337 dest = change_address (destmem, HImode, destptr);
6338 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6339 emit_label (label);
6340 LABEL_NUSES (label) = 1;
6341 }
6342 if (max_size > 1)
6343 {
6344 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6345 dest = change_address (destmem, QImode, destptr);
6346 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6347 emit_label (label);
6348 LABEL_NUSES (label) = 1;
6349 }
6350 }
6351
6352 /* Decrease COUNTREG by VALUE. */
6353 static void
6354 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6355 {
6356 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6357 }
6358
6359 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or
6360 store enough bytes to DESTMEM, to align it to DESIRED_ALIGNMENT. The original
6361 alignment is ALIGN.
6362 Depending on ISSETMEM, either SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.
6363 The return value is the updated DESTMEM. */
6364
6365 static rtx
6366 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6367 rtx destptr, rtx srcptr, rtx value,
6368 rtx vec_value, rtx count, int align,
6369 int desired_alignment, bool issetmem)
6370 {
6371 int i;
6372 for (i = 1; i < desired_alignment; i <<= 1)
6373 {
6374 if (align <= i)
6375 {
6376 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6377 if (issetmem)
6378 {
6379 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6380 destmem = emit_memset (destmem, destptr, vec_value, i);
6381 else
6382 destmem = emit_memset (destmem, destptr, value, i);
6383 }
6384 else
6385 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6386 ix86_adjust_counter (count, i);
6387 emit_label (label);
6388 LABEL_NUSES (label) = 1;
6389 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6390 }
6391 }
6392 return destmem;
6393 }
6394
6395 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
6396 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6397 and jump to DONE_LABEL. */
6398 static void
6399 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6400 rtx destptr, rtx srcptr,
6401 rtx value, rtx vec_value,
6402 rtx count, int size,
6403 rtx done_label, bool issetmem)
6404 {
6405 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6406 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6407 rtx modesize;
6408 int n;
6409
6410 /* If we do not have a vector value to copy, we must reduce the size. */
6411 if (issetmem)
6412 {
6413 if (!vec_value)
6414 {
6415 if (GET_MODE (value) == VOIDmode && size > 8)
6416 mode = Pmode;
6417 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6418 mode = GET_MODE (value);
6419 }
6420 else
6421 mode = GET_MODE (vec_value), value = vec_value;
6422 }
6423 else
6424 {
6425 /* Choose appropriate vector mode. */
6426 if (size >= 32)
6427 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6428 else if (size >= 16)
6429 mode = TARGET_SSE ? V16QImode : DImode;
6430 srcmem = change_address (srcmem, mode, srcptr);
6431 }
6432 destmem = change_address (destmem, mode, destptr);
6433 modesize = GEN_INT (GET_MODE_SIZE (mode));
6434 gcc_assert (GET_MODE_SIZE (mode) <= size);
6435 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6436 {
6437 if (issetmem)
6438 emit_move_insn (destmem, gen_lowpart (mode, value));
6439 else
6440 {
6441 emit_move_insn (destmem, srcmem);
6442 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6443 }
6444 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6445 }
6446
6447 destmem = offset_address (destmem, count, 1);
6448 destmem = offset_address (destmem, GEN_INT (-2 * size),
6449 GET_MODE_SIZE (mode));
6450 if (!issetmem)
6451 {
6452 srcmem = offset_address (srcmem, count, 1);
6453 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6454 GET_MODE_SIZE (mode));
6455 }
6456 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6457 {
6458 if (issetmem)
6459 emit_move_insn (destmem, gen_lowpart (mode, value));
6460 else
6461 {
6462 emit_move_insn (destmem, srcmem);
6463 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6464 }
6465 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6466 }
6467 emit_jump_insn (gen_jump (done_label));
6468 emit_barrier ();
6469
6470 emit_label (label);
6471 LABEL_NUSES (label) = 1;
6472 }
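
/* Illustrative sketch, not compiler code: the sequence above handles a
   block known to be SIZE..2*SIZE-1 bytes long by copying the first SIZE
   bytes and the last SIZE bytes; the two stores may overlap, which only
   rewrites some bytes with the same data.  A hypothetical byte-level
   model of the copy case:

     #include <string.h>

     // COUNT is assumed to satisfy size <= count < 2 * size.
     static void
     model_small_copy (char *dst, const char *src, long count, int size)
     {
       memcpy (dst, src, size);                         // first SIZE bytes
       memcpy (dst + count - size, src + count - size,  // last SIZE bytes
               size);
     }
*/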
6473
6474 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
6475 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6476 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
6477 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6478 DONE_LABEL is a label after the whole copying sequence. The label is created
6479 on demand if *DONE_LABEL is NULL.
6480 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for
6481 new bounds after the initial copies.
6482
6483 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6484 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6485 we will dispatch to a library call for large blocks.
6486
6487 In pseudocode we do:
6488
6489 if (COUNT < SIZE)
6490 {
6491 Assume that SIZE is 4. Bigger sizes are handled analogously
6492 if (COUNT & 4)
6493 {
6494 copy 4 bytes from SRCPTR to DESTPTR
6495 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6496 goto done_label
6497 }
6498 if (!COUNT)
6499 goto done_label;
6500 copy 1 byte from SRCPTR to DESTPTR
6501 if (COUNT & 2)
6502 {
6503 copy 2 bytes from SRCPTR to DESTPTR
6504 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6505 }
6506 }
6507 else
6508 {
6509 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6510 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
6511
6512 OLD_DESTPTR = DESTPTR;
6513 Align DESTPTR up to DESIRED_ALIGN
6514 SRCPTR += DESTPTR - OLD_DESTPTR
6515 COUNT -= DESTPTR - OLD_DESTPTR
6516 if (DYNAMIC_CHECK)
6517 Round COUNT down to multiple of SIZE
6518 << optional caller supplied zero size guard is here >>
6519 << optional caller supplied dynamic check is here >>
6520 << caller supplied main copy loop is here >>
6521 }
6522 done_label:
6523 */
6524 static void
6525 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6526 rtx *destptr, rtx *srcptr,
6527 machine_mode mode,
6528 rtx value, rtx vec_value,
6529 rtx *count,
6530 rtx_code_label **done_label,
6531 int size,
6532 int desired_align,
6533 int align,
6534 unsigned HOST_WIDE_INT *min_size,
6535 bool dynamic_check,
6536 bool issetmem)
6537 {
6538 rtx_code_label *loop_label = NULL, *label;
6539 int n;
6540 rtx modesize;
6541 int prolog_size = 0;
6542 rtx mode_value;
6543
6544 /* Choose the proper value to copy. */
6545 if (issetmem && VECTOR_MODE_P (mode))
6546 mode_value = vec_value;
6547 else
6548 mode_value = value;
6549 gcc_assert (GET_MODE_SIZE (mode) <= size);
6550
6551 /* See if block is big or small, handle small blocks. */
6552 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6553 {
6554 int size2 = size;
6555 loop_label = gen_label_rtx ();
6556
6557 if (!*done_label)
6558 *done_label = gen_label_rtx ();
6559
6560 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6561 1, loop_label);
6562 size2 >>= 1;
6563
6564 /* Handle sizes > 3. */
6565 for (;size2 > 2; size2 >>= 1)
6566 expand_small_cpymem_or_setmem (destmem, srcmem,
6567 *destptr, *srcptr,
6568 value, vec_value,
6569 *count,
6570 size2, *done_label, issetmem);
6571 /* Nothing to copy? Jump to DONE_LABEL if so */
6572 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6573 1, *done_label);
6574
6575 /* Do a byte copy. */
6576 destmem = change_address (destmem, QImode, *destptr);
6577 if (issetmem)
6578 emit_move_insn (destmem, gen_lowpart (QImode, value));
6579 else
6580 {
6581 srcmem = change_address (srcmem, QImode, *srcptr);
6582 emit_move_insn (destmem, srcmem);
6583 }
6584
6585 /* Handle sizes 2 and 3. */
6586 label = ix86_expand_aligntest (*count, 2, false);
6587 destmem = change_address (destmem, HImode, *destptr);
6588 destmem = offset_address (destmem, *count, 1);
6589 destmem = offset_address (destmem, GEN_INT (-2), 2);
6590 if (issetmem)
6591 emit_move_insn (destmem, gen_lowpart (HImode, value));
6592 else
6593 {
6594 srcmem = change_address (srcmem, HImode, *srcptr);
6595 srcmem = offset_address (srcmem, *count, 1);
6596 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6597 emit_move_insn (destmem, srcmem);
6598 }
6599
6600 emit_label (label);
6601 LABEL_NUSES (label) = 1;
6602 emit_jump_insn (gen_jump (*done_label));
6603 emit_barrier ();
6604 }
6605 else
6606 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6607 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6608
6609 /* Start memcpy for COUNT >= SIZE. */
6610 if (loop_label)
6611 {
6612 emit_label (loop_label);
6613 LABEL_NUSES (loop_label) = 1;
6614 }
6615
6616 /* Copy first desired_align bytes. */
6617 if (!issetmem)
6618 srcmem = change_address (srcmem, mode, *srcptr);
6619 destmem = change_address (destmem, mode, *destptr);
6620 modesize = GEN_INT (GET_MODE_SIZE (mode));
6621 for (n = 0; prolog_size < desired_align - align; n++)
6622 {
6623 if (issetmem)
6624 emit_move_insn (destmem, mode_value);
6625 else
6626 {
6627 emit_move_insn (destmem, srcmem);
6628 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6629 }
6630 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6631 prolog_size += GET_MODE_SIZE (mode);
6632 }
6633
6634
6635 /* Copy last SIZE bytes. */
6636 destmem = offset_address (destmem, *count, 1);
6637 destmem = offset_address (destmem,
6638 GEN_INT (-size - prolog_size),
6639 1);
6640 if (issetmem)
6641 emit_move_insn (destmem, mode_value);
6642 else
6643 {
6644 srcmem = offset_address (srcmem, *count, 1);
6645 srcmem = offset_address (srcmem,
6646 GEN_INT (-size - prolog_size),
6647 1);
6648 emit_move_insn (destmem, srcmem);
6649 }
6650 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6651 {
6652 destmem = offset_address (destmem, modesize, 1);
6653 if (issetmem)
6654 emit_move_insn (destmem, mode_value);
6655 else
6656 {
6657 srcmem = offset_address (srcmem, modesize, 1);
6658 emit_move_insn (destmem, srcmem);
6659 }
6660 }
6661
6662 /* Align destination. */
6663 if (desired_align > 1 && desired_align > align)
6664 {
6665 rtx saveddest = *destptr;
6666
6667 gcc_assert (desired_align <= size);
6668 /* Align destptr up, place it to new register. */
6669 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6670 GEN_INT (prolog_size),
6671 NULL_RTX, 1, OPTAB_DIRECT);
6672 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6673 REG_POINTER (*destptr) = 1;
6674 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6675 GEN_INT (-desired_align),
6676 *destptr, 1, OPTAB_DIRECT);
6677 /* See how many bytes we skipped. */
6678 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6679 *destptr,
6680 saveddest, 1, OPTAB_DIRECT);
6681 /* Adjust srcptr and count. */
6682 if (!issetmem)
6683 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6684 saveddest, *srcptr, 1, OPTAB_DIRECT);
6685 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6686 saveddest, *count, 1, OPTAB_DIRECT);
6687 /* We copied at most size + prolog_size. */
6688 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6689 *min_size
6690 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6691 else
6692 *min_size = 0;
6693
6694 /* Our loops always round down the block size, but for dispatch to
6695 the library we need the precise value. */
6696 if (dynamic_check)
6697 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6698 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6699 }
6700 else
6701 {
6702 gcc_assert (prolog_size == 0);
6703 /* Decrease count, so we won't end up copying last word twice. */
6704 if (!CONST_INT_P (*count))
6705 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6706 constm1_rtx, *count, 1, OPTAB_DIRECT);
6707 else
6708 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6709 (unsigned HOST_WIDE_INT)size));
6710 if (*min_size)
6711 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6712 }
6713 }
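
/* Illustrative sketch, not compiler code: the destination-alignment step
   above rounds DESTPTR up to DESIRED_ALIGN and compensates SRCPTR and
   COUNT by the number of bytes skipped (those bytes were already covered
   by the possibly misaligned prologue moves).  A hypothetical pointer
   model of that adjustment:

     #include <stdint.h>

     struct copy_state { uintptr_t dst, src, count; };

     // PROLOG_SIZE bytes were already copied; DESIRED_ALIGN is a power of 2.
     static void
     model_align_dest (struct copy_state *s, int prolog_size, int desired_align)
     {
       uintptr_t old_dst = s->dst;
       s->dst = (s->dst + prolog_size) & -(uintptr_t) desired_align;
       uintptr_t skipped = s->dst - old_dst;
       s->src += skipped;
       s->count -= skipped;
     }
*/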
6714
6715
6716 /* This function is like the previous one, except here we know how many bytes
6717 need to be copied. That allows us to update alignment not only of DST, which
6718 is returned, but also of SRC, which is passed as a pointer for that
6719 reason. */
6720 static rtx
6721 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6722 rtx srcreg, rtx value, rtx vec_value,
6723 int desired_align, int align_bytes,
6724 bool issetmem)
6725 {
6726 rtx src = NULL;
6727 rtx orig_dst = dst;
6728 rtx orig_src = NULL;
6729 int piece_size = 1;
6730 int copied_bytes = 0;
6731
6732 if (!issetmem)
6733 {
6734 gcc_assert (srcp != NULL);
6735 src = *srcp;
6736 orig_src = src;
6737 }
6738
6739 for (piece_size = 1;
6740 piece_size <= desired_align && copied_bytes < align_bytes;
6741 piece_size <<= 1)
6742 {
6743 if (align_bytes & piece_size)
6744 {
6745 if (issetmem)
6746 {
6747 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6748 dst = emit_memset (dst, destreg, vec_value, piece_size);
6749 else
6750 dst = emit_memset (dst, destreg, value, piece_size);
6751 }
6752 else
6753 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6754 copied_bytes += piece_size;
6755 }
6756 }
6757 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6758 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6759 if (MEM_SIZE_KNOWN_P (orig_dst))
6760 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6761
6762 if (!issetmem)
6763 {
6764 int src_align_bytes = get_mem_align_offset (src, desired_align
6765 * BITS_PER_UNIT);
6766 if (src_align_bytes >= 0)
6767 src_align_bytes = desired_align - src_align_bytes;
6768 if (src_align_bytes >= 0)
6769 {
6770 unsigned int src_align;
6771 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6772 {
6773 if ((src_align_bytes & (src_align - 1))
6774 == (align_bytes & (src_align - 1)))
6775 break;
6776 }
6777 if (src_align > (unsigned int) desired_align)
6778 src_align = desired_align;
6779 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6780 set_mem_align (src, src_align * BITS_PER_UNIT);
6781 }
6782 if (MEM_SIZE_KNOWN_P (orig_src))
6783 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6784 *srcp = src;
6785 }
6786
6787 return dst;
6788 }
6789
6790 /* Return true if ALG can be used in current context.
6791 Assume we expand memset if MEMSET is true. */
6792 static bool
6793 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6794 {
6795 if (alg == no_stringop)
6796 return false;
6797 if (alg == vector_loop)
6798 return TARGET_SSE || TARGET_AVX;
6799 /* Algorithms using the rep prefix want at least edi and ecx;
6800 additionally, memset wants eax and memcpy wants esi. Don't
6801 consider such algorithms if the user has appropriated those
6802 registers for their own purposes, or if we have a non-default
6803 address space, since some string insns cannot override the segment. */
6804 if (alg == rep_prefix_1_byte
6805 || alg == rep_prefix_4_byte
6806 || alg == rep_prefix_8_byte)
6807 {
6808 if (have_as)
6809 return false;
6810 if (fixed_regs[CX_REG]
6811 || fixed_regs[DI_REG]
6812 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6813 return false;
6814 }
6815 return true;
6816 }
6817
6818 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6819 static enum stringop_alg
6820 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6821 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6822 bool memset, bool zero_memset, bool have_as,
6823 int *dynamic_check, bool *noalign, bool recur)
6824 {
6825 const struct stringop_algs *algs;
6826 bool optimize_for_speed;
6827 int max = 0;
6828 const struct processor_costs *cost;
6829 int i;
6830 bool any_alg_usable_p = false;
6831
6832 *noalign = false;
6833 *dynamic_check = -1;
6834
6835 /* Even if the string operation call is cold, we still might spend a lot
6836 of time processing large blocks. */
6837 if (optimize_function_for_size_p (cfun)
6838 || (optimize_insn_for_size_p ()
6839 && (max_size < 256
6840 || (expected_size != -1 && expected_size < 256))))
6841 optimize_for_speed = false;
6842 else
6843 optimize_for_speed = true;
6844
6845 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6846 if (memset)
6847 algs = &cost->memset[TARGET_64BIT != 0];
6848 else
6849 algs = &cost->memcpy[TARGET_64BIT != 0];
6850
6851 /* See maximal size for user defined algorithm. */
6852 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6853 {
6854 enum stringop_alg candidate = algs->size[i].alg;
6855 bool usable = alg_usable_p (candidate, memset, have_as);
6856 any_alg_usable_p |= usable;
6857
6858 if (candidate != libcall && candidate && usable)
6859 max = algs->size[i].max;
6860 }
6861
6862 /* If the expected size is not known but the max size is small enough
6863 that the inline version is a win, set the expected size into
6864 the range. */
6865 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6866 && expected_size == -1)
6867 expected_size = min_size / 2 + max_size / 2;
6868
6869 /* If user specified the algorithm, honor it if possible. */
6870 if (ix86_stringop_alg != no_stringop
6871 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6872 return ix86_stringop_alg;
6873 /* rep; movq or rep; movl is the smallest variant. */
6874 else if (!optimize_for_speed)
6875 {
6876 *noalign = true;
6877 if (!count || (count & 3) || (memset && !zero_memset))
6878 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6879 ? rep_prefix_1_byte : loop_1_byte;
6880 else
6881 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6882 ? rep_prefix_4_byte : loop;
6883 }
6884 /* Very tiny blocks are best handled via the loop; REP is expensive to
6885 set up. */
6886 else if (expected_size != -1 && expected_size < 4)
6887 return loop_1_byte;
6888 else if (expected_size != -1)
6889 {
6890 enum stringop_alg alg = libcall;
6891 bool alg_noalign = false;
6892 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6893 {
6894 /* We get here if the algorithms that were not libcall-based
6895 were rep-prefix based and we are unable to use rep prefixes
6896 based on global register usage. Break out of the loop and
6897 use the heuristic below. */
6898 if (algs->size[i].max == 0)
6899 break;
6900 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6901 {
6902 enum stringop_alg candidate = algs->size[i].alg;
6903
6904 if (candidate != libcall
6905 && alg_usable_p (candidate, memset, have_as))
6906 {
6907 alg = candidate;
6908 alg_noalign = algs->size[i].noalign;
6909 }
6910 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6911 last non-libcall inline algorithm. */
6912 if (TARGET_INLINE_ALL_STRINGOPS)
6913 {
6914 /* When the current size is best to be copied by a libcall,
6915 but we are still forced to inline, run the heuristic below
6916 that will pick code for medium sized blocks. */
6917 if (alg != libcall)
6918 {
6919 *noalign = alg_noalign;
6920 return alg;
6921 }
6922 else if (!any_alg_usable_p)
6923 break;
6924 }
6925 else if (alg_usable_p (candidate, memset, have_as))
6926 {
6927 *noalign = algs->size[i].noalign;
6928 return candidate;
6929 }
6930 }
6931 }
6932 }
6933 /* When asked to inline the call anyway, try to pick a meaningful choice.
6934 We look for the maximal size of a block that is faster to copy by hand and
6935 take blocks of at most that size, guessing that the average size will
6936 be roughly half of the block.
6937
6938 If this turns out to be bad, we might simply specify the preferred
6939 choice in ix86_costs. */
6940 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6941 && (algs->unknown_size == libcall
6942 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6943 {
6944 enum stringop_alg alg;
6945 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6946
6947 /* If there aren't any usable algorithms or if recursing already,
6948 then recursing on smaller sizes or same size isn't going to
6949 find anything. Just return the simple byte-at-a-time copy loop. */
6950 if (!any_alg_usable_p || recur)
6951 {
6952 /* Pick something reasonable. */
6953 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6954 *dynamic_check = 128;
6955 return loop_1_byte;
6956 }
6957 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6958 zero_memset, have_as, dynamic_check, noalign, true);
6959 gcc_assert (*dynamic_check == -1);
6960 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6961 *dynamic_check = max;
6962 else
6963 gcc_assert (alg != libcall);
6964 return alg;
6965 }
6966 return (alg_usable_p (algs->unknown_size, memset, have_as)
6967 ? algs->unknown_size : libcall);
6968 }
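
/* Illustrative sketch, not compiler code: a simplified model of the size
   table lookup decide_alg performs, ignoring usability checks and the
   TARGET_INLINE_* special cases.  The names below are hypothetical:

     // One entry of a stringop size table: use ALG for blocks up to MAX
     // bytes (MAX == -1 means "any size").
     struct size_entry { int max; int alg; };

     static int
     model_pick_alg (const struct size_entry *table, int n,
                     int expected_size, int fallback)
     {
       for (int i = 0; i < n; i++)
         if (table[i].max == -1 || table[i].max >= expected_size)
           return table[i].alg;
       return fallback;
     }
*/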
6969
6970 /* Decide on alignment. We know that the operand is already aligned to ALIGN
6971 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
6972 static int
6973 decide_alignment (int align,
6974 enum stringop_alg alg,
6975 int expected_size,
6976 machine_mode move_mode)
6977 {
6978 int desired_align = 0;
6979
6980 gcc_assert (alg != no_stringop);
6981
6982 if (alg == libcall)
6983 return 0;
6984 if (move_mode == VOIDmode)
6985 return 0;
6986
6987 desired_align = GET_MODE_SIZE (move_mode);
6988 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
6989 copying a whole cache line at once. */
6990 if (TARGET_PENTIUMPRO
6991 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
6992 desired_align = 8;
6993
6994 if (optimize_size)
6995 desired_align = 1;
6996 if (desired_align < align)
6997 desired_align = align;
6998 if (expected_size != -1 && expected_size < 4)
6999 desired_align = align;
7000
7001 return desired_align;
7002 }
7003
7004
7005 /* Helper function for memset. For QImode value 0xXY produce
7006 0xXYXYXYXY of the width specified by MODE. This is essentially
7007 a * 0x01010101, but we can do slightly better than
7008 synth_mult by unwinding the sequence by hand on CPUs with
7009 slow multiply. */
7010 static rtx
7011 promote_duplicated_reg (machine_mode mode, rtx val)
7012 {
7013 machine_mode valmode = GET_MODE (val);
7014 rtx tmp;
7015 int nops = mode == DImode ? 3 : 2;
7016
7017 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7018 if (val == const0_rtx)
7019 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7020 if (CONST_INT_P (val))
7021 {
7022 HOST_WIDE_INT v = INTVAL (val) & 255;
7023
7024 v |= v << 8;
7025 v |= v << 16;
7026 if (mode == DImode)
7027 v |= (v << 16) << 16;
7028 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7029 }
7030
7031 if (valmode == VOIDmode)
7032 valmode = QImode;
7033 if (valmode != QImode)
7034 val = gen_lowpart (QImode, val);
7035 if (mode == QImode)
7036 return val;
7037 if (!TARGET_PARTIAL_REG_STALL)
7038 nops--;
7039 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7040 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7041 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7042 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7043 {
7044 rtx reg = convert_modes (mode, QImode, val, true);
7045 tmp = promote_duplicated_reg (mode, const1_rtx);
7046 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7047 OPTAB_DIRECT);
7048 }
7049 else
7050 {
7051 rtx reg = convert_modes (mode, QImode, val, true);
7052
7053 if (!TARGET_PARTIAL_REG_STALL)
7054 emit_insn (gen_insv_1 (mode, reg, reg));
7055 else
7056 {
7057 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7058 NULL, 1, OPTAB_DIRECT);
7059 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7060 OPTAB_DIRECT);
7061 }
7062 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7063 NULL, 1, OPTAB_DIRECT);
7064 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7065 if (mode == SImode)
7066 return reg;
7067 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7068 NULL, 1, OPTAB_DIRECT);
7069 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7070 return reg;
7071 }
7072 }
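
/* Illustrative sketch, not compiler code: both strategies above compute
   the same byte broadcast, either as val * 0x0101...01 or as the shift/or
   ladder.  A hypothetical C version for a 64-bit word:

     #include <stdint.h>

     // Replicate the low byte of VAL into every byte of a 64-bit word,
     // e.g. 0xXY -> 0xXYXYXYXYXYXYXYXY.
     static inline uint64_t
     broadcast_byte (uint8_t val)
     {
       uint64_t v = val;
       v |= v << 8;
       v |= v << 16;
       v |= v << 32;
       return v;   // equals val * 0x0101010101010101ULL
     }
*/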
7073
7074 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7075 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
7076 getting alignment from ALIGN to DESIRED_ALIGN. */
7077 static rtx
7078 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7079 int align)
7080 {
7081 rtx promoted_val;
7082
7083 if (TARGET_64BIT
7084 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7085 promoted_val = promote_duplicated_reg (DImode, val);
7086 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7087 promoted_val = promote_duplicated_reg (SImode, val);
7088 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7089 promoted_val = promote_duplicated_reg (HImode, val);
7090 else
7091 promoted_val = val;
7092
7093 return promoted_val;
7094 }
7095
7096 /* Copy the address to a Pmode register. This is used for x32 to
7097 truncate DImode TLS address to a SImode register. */
7098
7099 static rtx
7100 ix86_copy_addr_to_reg (rtx addr)
7101 {
7102 rtx reg;
7103 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7104 {
7105 reg = copy_addr_to_reg (addr);
7106 REG_POINTER (reg) = 1;
7107 return reg;
7108 }
7109 else
7110 {
7111 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7112 reg = copy_to_mode_reg (DImode, addr);
7113 REG_POINTER (reg) = 1;
7114 return gen_rtx_SUBREG (SImode, reg, 0);
7115 }
7116 }
7117
7118 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
7119 operations when profitable. The code depends upon architecture, block size
7120 and alignment, but always has one of the following overall structures:
7121
7122 Aligned move sequence:
7123
7124 1) Prologue guard: Conditional that jumps up to epilogues for small
7125 blocks that can be handled by epilogue alone. This is faster
7126 but also needed for correctness, since the prologue assumes the block
7127 is larger than the desired alignment.
7128
7129 Optional dynamic check for size and libcall for large
7130 blocks is emitted here too, with -minline-stringops-dynamically.
7131
7132 2) Prologue: copy first few bytes in order to get destination
7133 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7134 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7135 copied. We emit either a jump tree on power of two sized
7136 blocks, or a byte loop.
7137
7138 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7139 with specified algorithm.
7140
7141 4) Epilogue: code copying tail of the block that is too small to be
7142 handled by main body (or up to size guarded by prologue guard).
7143
7144 Misaligned move sequence
7145
7146 1) misaligned move prologue/epilogue containing:
7147 a) Prologue handling small memory blocks and jumping to done_label
7148 (skipped if blocks are known to be large enough)
7149 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
7150 needed by single possibly misaligned move
7151 (skipped if alignment is not needed)
7152 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7153
7154 2) Zero size guard dispatching to done_label, if needed
7155
7156 3) Dispatch to library call, if needed,
7157
7158 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7159 with specified algorithm. */
7160 bool
7161 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7162 rtx align_exp, rtx expected_align_exp,
7163 rtx expected_size_exp, rtx min_size_exp,
7164 rtx max_size_exp, rtx probable_max_size_exp,
7165 bool issetmem)
7166 {
7167 rtx destreg;
7168 rtx srcreg = NULL;
7169 rtx_code_label *label = NULL;
7170 rtx tmp;
7171 rtx_code_label *jump_around_label = NULL;
7172 HOST_WIDE_INT align = 1;
7173 unsigned HOST_WIDE_INT count = 0;
7174 HOST_WIDE_INT expected_size = -1;
7175 int size_needed = 0, epilogue_size_needed;
7176 int desired_align = 0, align_bytes = 0;
7177 enum stringop_alg alg;
7178 rtx promoted_val = NULL;
7179 rtx vec_promoted_val = NULL;
7180 bool force_loopy_epilogue = false;
7181 int dynamic_check;
7182 bool need_zero_guard = false;
7183 bool noalign;
7184 machine_mode move_mode = VOIDmode;
7185 machine_mode wider_mode;
7186 int unroll_factor = 1;
7187 /* TODO: Once value ranges are available, fill in proper data. */
7188 unsigned HOST_WIDE_INT min_size = 0;
7189 unsigned HOST_WIDE_INT max_size = -1;
7190 unsigned HOST_WIDE_INT probable_max_size = -1;
7191 bool misaligned_prologue_used = false;
7192 bool have_as;
7193
7194 if (CONST_INT_P (align_exp))
7195 align = INTVAL (align_exp);
7196 /* i386 can do misaligned access at a reasonably increased cost. */
7197 if (CONST_INT_P (expected_align_exp)
7198 && INTVAL (expected_align_exp) > align)
7199 align = INTVAL (expected_align_exp);
7200 /* ALIGN is the minimum of destination and source alignment, but we care here
7201 just about destination alignment. */
7202 else if (!issetmem
7203 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7204 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7205
7206 if (CONST_INT_P (count_exp))
7207 {
7208 min_size = max_size = probable_max_size = count = expected_size
7209 = INTVAL (count_exp);
7210 /* When COUNT is 0, there is nothing to do. */
7211 if (!count)
7212 return true;
7213 }
7214 else
7215 {
7216 if (min_size_exp)
7217 min_size = INTVAL (min_size_exp);
7218 if (max_size_exp)
7219 max_size = INTVAL (max_size_exp);
7220 if (probable_max_size_exp)
7221 probable_max_size = INTVAL (probable_max_size_exp);
7222 if (CONST_INT_P (expected_size_exp))
7223 expected_size = INTVAL (expected_size_exp);
7224 }
7225
7226 /* Make sure we don't need to care about overflow later on. */
7227 if (count > (HOST_WIDE_INT_1U << 30))
7228 return false;
7229
7230 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7231 if (!issetmem)
7232 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7233
7234 /* Step 0: Decide on preferred algorithm, desired alignment and
7235 size of chunks to be copied by main loop. */
7236 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7237 issetmem,
7238 issetmem && val_exp == const0_rtx, have_as,
7239 &dynamic_check, &noalign, false);
7240
7241 if (dump_file)
7242 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7243 stringop_alg_names[alg]);
7244
7245 if (alg == libcall)
7246 return false;
7247 gcc_assert (alg != no_stringop);
7248
7249 /* For now vector-version of memset is generated only for memory zeroing, as
7250 creating of promoted vector value is very cheap in this case. */
7251 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7252 alg = unrolled_loop;
7253
7254 if (!count)
7255 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7256 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7257 if (!issetmem)
7258 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7259
7260 unroll_factor = 1;
7261 move_mode = word_mode;
7262 switch (alg)
7263 {
7264 case libcall:
7265 case no_stringop:
7266 case last_alg:
7267 gcc_unreachable ();
7268 case loop_1_byte:
7269 need_zero_guard = true;
7270 move_mode = QImode;
7271 break;
7272 case loop:
7273 need_zero_guard = true;
7274 break;
7275 case unrolled_loop:
7276 need_zero_guard = true;
7277 unroll_factor = (TARGET_64BIT ? 4 : 2);
7278 break;
7279 case vector_loop:
7280 need_zero_guard = true;
7281 unroll_factor = 4;
7282 /* Find the widest supported mode. */
7283 move_mode = word_mode;
7284 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7285 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7286 move_mode = wider_mode;
7287
7288 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7289 move_mode = TImode;
7290
7291 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7292 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7293 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7294 {
7295 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7296 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7297 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7298 move_mode = word_mode;
7299 }
7300 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7301 break;
7302 case rep_prefix_8_byte:
7303 move_mode = DImode;
7304 break;
7305 case rep_prefix_4_byte:
7306 move_mode = SImode;
7307 break;
7308 case rep_prefix_1_byte:
7309 move_mode = QImode;
7310 break;
7311 }
7312 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7313 epilogue_size_needed = size_needed;
7314
7315 /* If we are going to call any library calls conditionally, make sure any
7316 pending stack adjustment happen before the first conditional branch,
7317 otherwise they will be emitted before the library call only and won't
7318 happen from the other branches. */
7319 if (dynamic_check != -1)
7320 do_pending_stack_adjust ();
7321
7322 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7323 if (!TARGET_ALIGN_STRINGOPS || noalign)
7324 align = desired_align;
7325
7326 /* Step 1: Prologue guard. */
7327
7328 /* Alignment code needs count to be in register. */
7329 if (CONST_INT_P (count_exp) && desired_align > align)
7330 {
7331 if (INTVAL (count_exp) > desired_align
7332 && INTVAL (count_exp) > size_needed)
7333 {
7334 align_bytes
7335 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7336 if (align_bytes <= 0)
7337 align_bytes = 0;
7338 else
7339 align_bytes = desired_align - align_bytes;
7340 }
7341 if (align_bytes == 0)
7342 count_exp = force_reg (counter_mode (count_exp), count_exp);
7343 }
7344 gcc_assert (desired_align >= 1 && align >= 1);
7345
7346 /* Misaligned move sequences handle both prologue and epilogue at once.
7347 Default code generation results in smaller code for large alignments
7348 and also avoids redundant work when sizes are known precisely. */
7349 misaligned_prologue_used
7350 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7351 && MAX (desired_align, epilogue_size_needed) <= 32
7352 && desired_align <= epilogue_size_needed
7353 && ((desired_align > align && !align_bytes)
7354 || (!count && epilogue_size_needed > 1)));
7355
7356 /* Do the cheap promotion to allow better CSE across the
7357 main loop and epilogue (i.e. one load of the big constant in
7358 front of all the code).
7359 For now the misaligned move sequences do not have a fast path
7360 without broadcasting. */
7361 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7362 {
7363 if (alg == vector_loop)
7364 {
7365 gcc_assert (val_exp == const0_rtx);
7366 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7367 promoted_val = promote_duplicated_reg_to_size (val_exp,
7368 GET_MODE_SIZE (word_mode),
7369 desired_align, align);
7370 }
7371 else
7372 {
7373 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7374 desired_align, align);
7375 }
7376 }
7377 /* Misaligned move sequences handle both prologues and epilogues at once.
7378 Default code generation results in smaller code for large alignments and
7379 also avoids redundant work when sizes are known precisely. */
7380 if (misaligned_prologue_used)
7381 {
7382 /* The misaligned move prologue handles small blocks by itself. */
7383 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7384 (dst, src, &destreg, &srcreg,
7385 move_mode, promoted_val, vec_promoted_val,
7386 &count_exp,
7387 &jump_around_label,
7388 desired_align < align
7389 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7390 desired_align, align, &min_size, dynamic_check, issetmem);
7391 if (!issetmem)
7392 src = change_address (src, BLKmode, srcreg);
7393 dst = change_address (dst, BLKmode, destreg);
7394 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7395 epilogue_size_needed = 0;
7396 if (need_zero_guard
7397 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7398 {
7399 /* It is possible that we copied enough so the main loop will not
7400 execute. */
7401 gcc_assert (size_needed > 1);
7402 if (jump_around_label == NULL_RTX)
7403 jump_around_label = gen_label_rtx ();
7404 emit_cmp_and_jump_insns (count_exp,
7405 GEN_INT (size_needed),
7406 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7407 if (expected_size == -1
7408 || expected_size < (desired_align - align) / 2 + size_needed)
7409 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7410 else
7411 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7412 }
7413 }
7414 /* Ensure that alignment prologue won't copy past end of block. */
7415 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7416 {
7417 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7418 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7419 Make sure it is power of 2. */
7420 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7421
7422 /* To improve performance on small blocks, we jump around the VAL
7423 promotion. This means that if the promoted VAL is not constant,
7424 we might not use it in the epilogue and have to use the byte
7425 loop variant. */
7426 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7427 force_loopy_epilogue = true;
7428 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7429 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7430 {
7431 /* If main algorithm works on QImode, no epilogue is needed.
7432 For small sizes just don't align anything. */
7433 if (size_needed == 1)
7434 desired_align = align;
7435 else
7436 goto epilogue;
7437 }
7438 else if (!count
7439 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7440 {
7441 label = gen_label_rtx ();
7442 emit_cmp_and_jump_insns (count_exp,
7443 GEN_INT (epilogue_size_needed),
7444 LTU, 0, counter_mode (count_exp), 1, label);
7445 if (expected_size == -1 || expected_size < epilogue_size_needed)
7446 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7447 else
7448 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7449 }
7450 }
7451
7452 /* Emit code to decide on runtime whether library call or inline should be
7453 used. */
7454 if (dynamic_check != -1)
7455 {
7456 if (!issetmem && CONST_INT_P (count_exp))
7457 {
7458 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7459 {
7460 emit_block_copy_via_libcall (dst, src, count_exp);
7461 count_exp = const0_rtx;
7462 goto epilogue;
7463 }
7464 }
7465 else
7466 {
7467 rtx_code_label *hot_label = gen_label_rtx ();
7468 if (jump_around_label == NULL_RTX)
7469 jump_around_label = gen_label_rtx ();
7470 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7471 LEU, 0, counter_mode (count_exp),
7472 1, hot_label);
7473 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7474 if (issetmem)
7475 set_storage_via_libcall (dst, count_exp, val_exp);
7476 else
7477 emit_block_copy_via_libcall (dst, src, count_exp);
7478 emit_jump (jump_around_label);
7479 emit_label (hot_label);
7480 }
7481 }
7482
7483 /* Step 2: Alignment prologue. */
7484 /* Do the expensive promotion once we branched off the small blocks. */
7485 if (issetmem && !promoted_val)
7486 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7487 desired_align, align);
7488
7489 if (desired_align > align && !misaligned_prologue_used)
7490 {
7491 if (align_bytes == 0)
7492 {
7493 /* Except for the first move in the prologue, we no longer know
7494 the constant offset in the aliasing info. It doesn't seem worth
7495 the pain to maintain it for the first move, so throw away
7496 the info early. */
7497 dst = change_address (dst, BLKmode, destreg);
7498 if (!issetmem)
7499 src = change_address (src, BLKmode, srcreg);
7500 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7501 promoted_val, vec_promoted_val,
7502 count_exp, align, desired_align,
7503 issetmem);
7504 /* At most desired_align - align bytes are copied. */
7505 if (min_size < (unsigned)(desired_align - align))
7506 min_size = 0;
7507 else
7508 min_size -= desired_align - align;
7509 }
7510 else
7511 {
7512 /* If we know how many bytes need to be stored before dst is
7513 sufficiently aligned, maintain aliasing info accurately. */
7514 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7515 srcreg,
7516 promoted_val,
7517 vec_promoted_val,
7518 desired_align,
7519 align_bytes,
7520 issetmem);
7521
7522 count_exp = plus_constant (counter_mode (count_exp),
7523 count_exp, -align_bytes);
7524 count -= align_bytes;
7525 min_size -= align_bytes;
7526 max_size -= align_bytes;
7527 }
7528 if (need_zero_guard
7529 && min_size < (unsigned HOST_WIDE_INT) size_needed
7530 && (count < (unsigned HOST_WIDE_INT) size_needed
7531 || (align_bytes == 0
7532 && count < ((unsigned HOST_WIDE_INT) size_needed
7533 + desired_align - align))))
7534 {
7535 /* It is possible that we copied enough so the main loop will not
7536 execute. */
7537 gcc_assert (size_needed > 1);
7538 if (label == NULL_RTX)
7539 label = gen_label_rtx ();
7540 emit_cmp_and_jump_insns (count_exp,
7541 GEN_INT (size_needed),
7542 LTU, 0, counter_mode (count_exp), 1, label);
7543 if (expected_size == -1
7544 || expected_size < (desired_align - align) / 2 + size_needed)
7545 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7546 else
7547 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7548 }
7549 }
7550 if (label && size_needed == 1)
7551 {
7552 emit_label (label);
7553 LABEL_NUSES (label) = 1;
7554 label = NULL;
7555 epilogue_size_needed = 1;
7556 if (issetmem)
7557 promoted_val = val_exp;
7558 }
7559 else if (label == NULL_RTX && !misaligned_prologue_used)
7560 epilogue_size_needed = size_needed;
7561
7562 /* Step 3: Main loop. */
7563
7564 switch (alg)
7565 {
7566 case libcall:
7567 case no_stringop:
7568 case last_alg:
7569 gcc_unreachable ();
7570 case loop_1_byte:
7571 case loop:
7572 case unrolled_loop:
7573 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7574 count_exp, move_mode, unroll_factor,
7575 expected_size, issetmem);
7576 break;
7577 case vector_loop:
7578 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7579 vec_promoted_val, count_exp, move_mode,
7580 unroll_factor, expected_size, issetmem);
7581 break;
7582 case rep_prefix_8_byte:
7583 case rep_prefix_4_byte:
7584 case rep_prefix_1_byte:
7585 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7586 val_exp, count_exp, move_mode, issetmem);
7587 break;
7588 }
7589 /* Adjust properly the offset of src and dest memory for aliasing. */
7590 if (CONST_INT_P (count_exp))
7591 {
7592 if (!issetmem)
7593 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7594 (count / size_needed) * size_needed);
7595 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7596 (count / size_needed) * size_needed);
7597 }
7598 else
7599 {
7600 if (!issetmem)
7601 src = change_address (src, BLKmode, srcreg);
7602 dst = change_address (dst, BLKmode, destreg);
7603 }
7604
7605 /* Step 4: Epilogue to copy the remaining bytes. */
7606 epilogue:
7607 if (label)
7608 {
7609 /* When the main loop is done, COUNT_EXP might hold original count,
7610 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7611 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7612 bytes. Compensate if needed. */
7613
7614 if (size_needed < epilogue_size_needed)
7615 {
7616 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7617 GEN_INT (size_needed - 1), count_exp, 1,
7618 OPTAB_DIRECT);
7619 if (tmp != count_exp)
7620 emit_move_insn (count_exp, tmp);
7621 }
7622 emit_label (label);
7623 LABEL_NUSES (label) = 1;
7624 }
7625
7626 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7627 {
7628 if (force_loopy_epilogue)
7629 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7630 epilogue_size_needed);
7631 else
7632 {
7633 if (issetmem)
7634 expand_setmem_epilogue (dst, destreg, promoted_val,
7635 vec_promoted_val, count_exp,
7636 epilogue_size_needed);
7637 else
7638 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7639 epilogue_size_needed);
7640 }
7641 }
7642 if (jump_around_label)
7643 emit_label (jump_around_label);
7644 return true;
7645 }
7646
7647
7648 /* Expand the appropriate insns for doing strlen if not just doing
7649 repnz; scasb
7650
7651 out = result, initialized with the start address
7652 align_rtx = alignment of the address.
7653 scratch = scratch register, initialized with the start address when
7654 not aligned, otherwise undefined
7655
7656 This is just the body. It needs the initializations mentioned above and
7657 some address computation at the end.  These things are done in i386.md.  */
7658
7659 static void
7660 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7661 {
7662 int align;
7663 rtx tmp;
7664 rtx_code_label *align_2_label = NULL;
7665 rtx_code_label *align_3_label = NULL;
7666 rtx_code_label *align_4_label = gen_label_rtx ();
7667 rtx_code_label *end_0_label = gen_label_rtx ();
7668 rtx mem;
7669 rtx tmpreg = gen_reg_rtx (SImode);
7670 rtx scratch = gen_reg_rtx (SImode);
7671 rtx cmp;
7672
7673 align = 0;
7674 if (CONST_INT_P (align_rtx))
7675 align = INTVAL (align_rtx);
7676
7677 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7678
7679 /* Is there a known alignment and is it less than 4? */
7680 if (align < 4)
7681 {
7682 rtx scratch1 = gen_reg_rtx (Pmode);
7683 emit_move_insn (scratch1, out);
7684 /* Is there a known alignment and is it not 2? */
7685 if (align != 2)
7686 {
7687 align_3_label = gen_label_rtx (); /* Label when address is 3 mod 4.  */
7688 align_2_label = gen_label_rtx (); /* Label when address is 2 mod 4.  */
7689
7690 /* Leave just the 3 lower bits. */
7691 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7692 NULL_RTX, 0, OPTAB_WIDEN);
7693
7694 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7695 Pmode, 1, align_4_label);
7696 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7697 Pmode, 1, align_2_label);
7698 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7699 Pmode, 1, align_3_label);
7700 }
7701 else
7702 {
7703 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7704 check whether it is aligned to 4 bytes.  */
7705
7706 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7707 NULL_RTX, 0, OPTAB_WIDEN);
7708
7709 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7710 Pmode, 1, align_4_label);
7711 }
7712
7713 mem = change_address (src, QImode, out);
7714
7715 /* Now compare the bytes. */
7716
7717 /* Compare the first few (at most 3) unaligned bytes byte by byte.  */
7718 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7719 QImode, 1, end_0_label);
7720
7721 /* Increment the address. */
7722 emit_insn (gen_add2_insn (out, const1_rtx));
7723
7724 /* Not needed with an alignment of 2 */
7725 if (align != 2)
7726 {
7727 emit_label (align_2_label);
7728
7729 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7730 end_0_label);
7731
7732 emit_insn (gen_add2_insn (out, const1_rtx));
7733
7734 emit_label (align_3_label);
7735 }
7736
7737 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7738 end_0_label);
7739
7740 emit_insn (gen_add2_insn (out, const1_rtx));
7741 }
7742
7743 /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
7744 align this loop: doing so only bloats the code and does not
7745 speed it up.  */
7746 emit_label (align_4_label);
7747
7748 mem = change_address (src, SImode, out);
7749 emit_move_insn (scratch, mem);
7750 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7751
7752 /* This formula yields a nonzero result iff one of the bytes is zero.
7753 This saves three branches inside the loop and many cycles.  */
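     /* Concretely, with x = scratch the sequence computes
	(x - 0x01010101) & ~x & 0x80808080, which is nonzero iff some byte
	of x is zero; the lowest such byte always gets its 0x80 marker set.  */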
7754
7755 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7756 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7757 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7758 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7759 gen_int_mode (0x80808080, SImode)));
7760 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7761 align_4_label);
7762
7763 if (TARGET_CMOVE)
7764 {
7765 rtx reg = gen_reg_rtx (SImode);
7766 rtx reg2 = gen_reg_rtx (Pmode);
7767 emit_move_insn (reg, tmpreg);
7768 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7769
7770 /* If zero is not in the first two bytes, move two bytes forward. */
7771 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7772 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7773 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7774 emit_insn (gen_rtx_SET (tmpreg,
7775 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7776 reg,
7777 tmpreg)));
7778 /* Emit lea manually to avoid clobbering of flags. */
7779 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7780
7781 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7782 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7783 emit_insn (gen_rtx_SET (out,
7784 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7785 reg2,
7786 out)));
7787 }
7788 else
7789 {
7790 rtx_code_label *end_2_label = gen_label_rtx ();
7791 /* Is zero in the first two bytes? */
7792
7793 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7794 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7795 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7796 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7797 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7798 pc_rtx);
7799 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7800 JUMP_LABEL (tmp) = end_2_label;
7801
7802 /* Not in the first two. Move two bytes forward. */
7803 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7804 emit_insn (gen_add2_insn (out, const2_rtx));
7805
7806 emit_label (end_2_label);
7807
7808 }
7809
7810 /* Avoid branch in fixing the byte. */
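     /* At this point OUT is 4 bytes past the start of the two-byte group
	containing the zero byte, and bit 7 of TMPREG's low byte is set iff
	the first byte of that group is the zero one.  Doubling the low byte
	moves that bit into the carry flag, and the sbb below subtracts
	either 4 or 3 so that OUT ends up pointing at the terminating zero.  */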
7811 tmpreg = gen_lowpart (QImode, tmpreg);
7812 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7813 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7814 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7815 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7816
7817 emit_label (end_0_label);
7818 }
7819
7820 /* Expand strlen. */
7821
7822 bool
7823 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7824 {
7825 if (TARGET_UNROLL_STRLEN
7826 && TARGET_INLINE_ALL_STRINGOPS
7827 && eoschar == const0_rtx
7828 && optimize > 1)
7829 {
7830 /* The generic strlen expansion is long.  Avoid expanding it
7831 unless TARGET_INLINE_ALL_STRINGOPS. */
7832 rtx addr = force_reg (Pmode, XEXP (src, 0));
7833 /* It seems that some optimizers do not combine a call like
7834 foo(strlen(bar), strlen(bar));
7835 when the move and the subtraction are done here.  They do compute
7836 the length just once when these instructions are done inside of
7837 output_strlen_unroll().  But since &bar[strlen(bar)] is
7838 often used and this variant uses one fewer register for the lifetime of
7839 output_strlen_unroll(), this is better. */
7840
7841 emit_move_insn (out, addr);
7842
7843 ix86_expand_strlensi_unroll_1 (out, src, align);
7844
7845 /* strlensi_unroll_1 returns the address of the zero at the end of
7846 the string, like memchr(), so compute the length by subtracting
7847 the start address. */
7848 emit_insn (gen_sub2_insn (out, addr));
7849 return true;
7850 }
7851 else
7852 return false;
7853 }
7854
7855 /* For a given symbol (function), construct code to compute the address of
7856 its PLT entry in the large x86-64 PIC model.  */
7857
7858 static rtx
7859 construct_plt_address (rtx symbol)
7860 {
7861 rtx tmp, unspec;
7862
7863 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7864 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7865 gcc_assert (Pmode == DImode);
7866
7867 tmp = gen_reg_rtx (Pmode);
7868 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7869
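     /* symbol@PLTOFF is the offset of SYMBOL's PLT entry from the GOT base,
	so adding the PIC register (which holds the GOT address in the large
	PIC model) yields the absolute address of the PLT entry.  */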
7870 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7871 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7872 return tmp;
7873 }
7874
7875 /* Additional registers that are clobbered by SYSV calls. */
7876
7877 static int const x86_64_ms_sysv_extra_clobbered_registers
7878 [NUM_X86_64_MS_CLOBBERED_REGS] =
7879 {
7880 SI_REG, DI_REG,
7881 XMM6_REG, XMM7_REG,
7882 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7883 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7884 };
7885
7886 rtx_insn *
7887 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7888 rtx callarg2,
7889 rtx pop, bool sibcall)
7890 {
7891 rtx vec[3];
7892 rtx use = NULL, call;
7893 unsigned int vec_len = 0;
7894 tree fndecl;
7895
7896 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7897 {
7898 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7899 if (fndecl
7900 && (lookup_attribute ("interrupt",
7901 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7902 error ("interrupt service routine cannot be called directly");
7903 }
7904 else
7905 fndecl = NULL_TREE;
7906
7907 if (pop == const0_rtx)
7908 pop = NULL;
7909 gcc_assert (!TARGET_64BIT || !pop);
7910
7911 if (TARGET_MACHO && !TARGET_64BIT)
7912 {
7913 #if TARGET_MACHO
7914 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7915 fnaddr = machopic_indirect_call_target (fnaddr);
7916 #endif
7917 }
7918 else
7919 {
7920 /* Static functions and indirect calls don't need the pic register.  Also,
7921 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
7922 attribute, making it an indirect call. */
7923 rtx addr = XEXP (fnaddr, 0);
7924 if (flag_pic
7925 && GET_CODE (addr) == SYMBOL_REF
7926 && !SYMBOL_REF_LOCAL_P (addr))
7927 {
7928 if (flag_plt
7929 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7930 || !lookup_attribute ("noplt",
7931 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7932 {
7933 if (!TARGET_64BIT
7934 || (ix86_cmodel == CM_LARGE_PIC
7935 && DEFAULT_ABI != MS_ABI))
7936 {
7937 use_reg (&use, gen_rtx_REG (Pmode,
7938 REAL_PIC_OFFSET_TABLE_REGNUM));
7939 if (ix86_use_pseudo_pic_reg ())
7940 emit_move_insn (gen_rtx_REG (Pmode,
7941 REAL_PIC_OFFSET_TABLE_REGNUM),
7942 pic_offset_table_rtx);
7943 }
7944 }
7945 else if (!TARGET_PECOFF && !TARGET_MACHO)
7946 {
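     /* The PLT is not being used here: load the callee's address from
	its GOT slot and call through that address indirectly.  */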
7947 if (TARGET_64BIT)
7948 {
7949 fnaddr = gen_rtx_UNSPEC (Pmode,
7950 gen_rtvec (1, addr),
7951 UNSPEC_GOTPCREL);
7952 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7953 }
7954 else
7955 {
7956 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
7957 UNSPEC_GOT);
7958 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7959 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
7960 fnaddr);
7961 }
7962 fnaddr = gen_const_mem (Pmode, fnaddr);
7963 /* Pmode may not be the same as word_mode for x32, which
7964 doesn't support indirect branch via 32-bit memory slot.
7965 Since x32 GOT slot is 64 bit with zero upper 32 bits,
7966 indirect branch via x32 GOT slot is OK. */
7967 if (GET_MODE (fnaddr) != word_mode)
7968 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
7969 fnaddr = gen_rtx_MEM (QImode, fnaddr);
7970 }
7971 }
7972 }
7973
7974 /* Skip setting up RAX register for -mskip-rax-setup when there are no
7975 parameters passed in vector registers. */
7976 if (TARGET_64BIT
7977 && (INTVAL (callarg2) > 0
7978 || (INTVAL (callarg2) == 0
7979 && (TARGET_SSE || !flag_skip_rax_setup))))
7980 {
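     /* The 64-bit SysV ABI expects AL to hold an upper bound on the number
	of vector registers used when calling a variadic function;
	CALLARG2 carries that count.  */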
7981 rtx al = gen_rtx_REG (QImode, AX_REG);
7982 emit_move_insn (al, callarg2);
7983 use_reg (&use, al);
7984 }
7985
7986 if (ix86_cmodel == CM_LARGE_PIC
7987 && !TARGET_PECOFF
7988 && MEM_P (fnaddr)
7989 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
7990 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
7991 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
7992 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
7993 branch via x32 GOT slot is OK. */
7994 else if (!(TARGET_X32
7995 && MEM_P (fnaddr)
7996 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
7997 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
7998 && (sibcall
7999 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8000 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8001 {
8002 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8003 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8004 }
8005
8006 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8007
8008 if (retval)
8009 call = gen_rtx_SET (retval, call);
8010 vec[vec_len++] = call;
8011
8012 if (pop)
8013 {
8014 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8015 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8016 vec[vec_len++] = pop;
8017 }
8018
8019 if (cfun->machine->no_caller_saved_registers
8020 && (!fndecl
8021 || (!TREE_THIS_VOLATILE (fndecl)
8022 && !lookup_attribute ("no_caller_saved_registers",
8023 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8024 {
8025 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8026 bool is_64bit_ms_abi = (TARGET_64BIT
8027 && ix86_function_abi (fndecl) == MS_ABI);
8028 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8029
8030 /* If there are no caller-saved registers, add all registers
8031 that are clobbered by the call which returns. */
8032 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8033 if (!fixed_regs[i]
8034 && (ix86_call_used_regs[i] == 1
8035 || (ix86_call_used_regs[i] & c_mask))
8036 && !STACK_REGNO_P (i)
8037 && !MMX_REGNO_P (i))
8038 clobber_reg (&use,
8039 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8040 }
8041 else if (TARGET_64BIT_MS_ABI
8042 && (!callarg2 || INTVAL (callarg2) != -2))
8043 {
8044 unsigned i;
8045
8046 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8047 {
8048 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8049 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8050
8051 clobber_reg (&use, gen_rtx_REG (mode, regno));
8052 }
8053
8054 /* Set here, but it may get cleared later. */
8055 if (TARGET_CALL_MS2SYSV_XLOGUES)
8056 {
8057 if (!TARGET_SSE)
8058 ;
8059
8060 /* Don't break hot-patched functions. */
8061 else if (ix86_function_ms_hook_prologue (current_function_decl))
8062 ;
8063
8064 /* TODO: Cases not yet examined. */
8065 else if (flag_split_stack)
8066 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8067
8068 else
8069 {
8070 gcc_assert (!reload_completed);
8071 cfun->machine->call_ms2sysv = true;
8072 }
8073 }
8074 }
8075
8076 if (vec_len > 1)
8077 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8078 rtx_insn *call_insn = emit_call_insn (call);
8079 if (use)
8080 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8081
8082 return call_insn;
8083 }
8084
8085 /* Split a simple return that pops POPC bytes from the stack into an
8086 indirect branch with a stack adjustment.  */
8087
8088 void
8089 ix86_split_simple_return_pop_internal (rtx popc)
8090 {
8091 struct machine_function *m = cfun->machine;
8092 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8093 rtx_insn *insn;
8094
8095 /* There is no "pascal" calling convention in any 64-bit ABI. */
8096 gcc_assert (!TARGET_64BIT);
8097
8098 insn = emit_insn (gen_pop (ecx));
8099 m->fs.cfa_offset -= UNITS_PER_WORD;
8100 m->fs.sp_offset -= UNITS_PER_WORD;
8101
8102 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8103 x = gen_rtx_SET (stack_pointer_rtx, x);
8104 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8105 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8106 RTX_FRAME_RELATED_P (insn) = 1;
8107
8108 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8109 x = gen_rtx_SET (stack_pointer_rtx, x);
8110 insn = emit_insn (x);
8111 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8112 RTX_FRAME_RELATED_P (insn) = 1;
8113
8114 /* Now return address is in ECX. */
8115 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8116 }
8117
8118 /* Errors in the source file can cause expand_expr to return const0_rtx
8119 where we expect a vector. To avoid crashing, use one of the vector
8120 clear instructions. */
8121
8122 static rtx
8123 safe_vector_operand (rtx x, machine_mode mode)
8124 {
8125 if (x == const0_rtx)
8126 x = CONST0_RTX (mode);
8127 return x;
8128 }
8129
8130 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8131
8132 static rtx
8133 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8134 {
8135 rtx pat;
8136 tree arg0 = CALL_EXPR_ARG (exp, 0);
8137 tree arg1 = CALL_EXPR_ARG (exp, 1);
8138 rtx op0 = expand_normal (arg0);
8139 rtx op1 = expand_normal (arg1);
8140 machine_mode tmode = insn_data[icode].operand[0].mode;
8141 machine_mode mode0 = insn_data[icode].operand[1].mode;
8142 machine_mode mode1 = insn_data[icode].operand[2].mode;
8143
8144 if (VECTOR_MODE_P (mode0))
8145 op0 = safe_vector_operand (op0, mode0);
8146 if (VECTOR_MODE_P (mode1))
8147 op1 = safe_vector_operand (op1, mode1);
8148
8149 if (optimize || !target
8150 || GET_MODE (target) != tmode
8151 || !insn_data[icode].operand[0].predicate (target, tmode))
8152 target = gen_reg_rtx (tmode);
8153
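     /* The insn wants a TImode operand but the builtin argument is a plain
	32-bit value: load it into the low element of an XMM register and
	use the TImode lowpart of that register.  */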
8154 if (GET_MODE (op1) == SImode && mode1 == TImode)
8155 {
8156 rtx x = gen_reg_rtx (V4SImode);
8157 emit_insn (gen_sse2_loadd (x, op1));
8158 op1 = gen_lowpart (TImode, x);
8159 }
8160
8161 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8162 op0 = copy_to_mode_reg (mode0, op0);
8163 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8164 op1 = copy_to_mode_reg (mode1, op1);
8165
8166 pat = GEN_FCN (icode) (target, op0, op1);
8167 if (! pat)
8168 return 0;
8169
8170 emit_insn (pat);
8171
8172 return target;
8173 }
8174
8175 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8176
8177 static rtx
8178 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8179 enum ix86_builtin_func_type m_type,
8180 enum rtx_code sub_code)
8181 {
8182 rtx pat;
8183 int i;
8184 int nargs;
8185 bool comparison_p = false;
8186 bool tf_p = false;
8187 bool last_arg_constant = false;
8188 int num_memory = 0;
8189 struct {
8190 rtx op;
8191 machine_mode mode;
8192 } args[4];
8193
8194 machine_mode tmode = insn_data[icode].operand[0].mode;
8195
8196 switch (m_type)
8197 {
8198 case MULTI_ARG_4_DF2_DI_I:
8199 case MULTI_ARG_4_DF2_DI_I1:
8200 case MULTI_ARG_4_SF2_SI_I:
8201 case MULTI_ARG_4_SF2_SI_I1:
8202 nargs = 4;
8203 last_arg_constant = true;
8204 break;
8205
8206 case MULTI_ARG_3_SF:
8207 case MULTI_ARG_3_DF:
8208 case MULTI_ARG_3_SF2:
8209 case MULTI_ARG_3_DF2:
8210 case MULTI_ARG_3_DI:
8211 case MULTI_ARG_3_SI:
8212 case MULTI_ARG_3_SI_DI:
8213 case MULTI_ARG_3_HI:
8214 case MULTI_ARG_3_HI_SI:
8215 case MULTI_ARG_3_QI:
8216 case MULTI_ARG_3_DI2:
8217 case MULTI_ARG_3_SI2:
8218 case MULTI_ARG_3_HI2:
8219 case MULTI_ARG_3_QI2:
8220 nargs = 3;
8221 break;
8222
8223 case MULTI_ARG_2_SF:
8224 case MULTI_ARG_2_DF:
8225 case MULTI_ARG_2_DI:
8226 case MULTI_ARG_2_SI:
8227 case MULTI_ARG_2_HI:
8228 case MULTI_ARG_2_QI:
8229 nargs = 2;
8230 break;
8231
8232 case MULTI_ARG_2_DI_IMM:
8233 case MULTI_ARG_2_SI_IMM:
8234 case MULTI_ARG_2_HI_IMM:
8235 case MULTI_ARG_2_QI_IMM:
8236 nargs = 2;
8237 last_arg_constant = true;
8238 break;
8239
8240 case MULTI_ARG_1_SF:
8241 case MULTI_ARG_1_DF:
8242 case MULTI_ARG_1_SF2:
8243 case MULTI_ARG_1_DF2:
8244 case MULTI_ARG_1_DI:
8245 case MULTI_ARG_1_SI:
8246 case MULTI_ARG_1_HI:
8247 case MULTI_ARG_1_QI:
8248 case MULTI_ARG_1_SI_DI:
8249 case MULTI_ARG_1_HI_DI:
8250 case MULTI_ARG_1_HI_SI:
8251 case MULTI_ARG_1_QI_DI:
8252 case MULTI_ARG_1_QI_SI:
8253 case MULTI_ARG_1_QI_HI:
8254 nargs = 1;
8255 break;
8256
8257 case MULTI_ARG_2_DI_CMP:
8258 case MULTI_ARG_2_SI_CMP:
8259 case MULTI_ARG_2_HI_CMP:
8260 case MULTI_ARG_2_QI_CMP:
8261 nargs = 2;
8262 comparison_p = true;
8263 break;
8264
8265 case MULTI_ARG_2_SF_TF:
8266 case MULTI_ARG_2_DF_TF:
8267 case MULTI_ARG_2_DI_TF:
8268 case MULTI_ARG_2_SI_TF:
8269 case MULTI_ARG_2_HI_TF:
8270 case MULTI_ARG_2_QI_TF:
8271 nargs = 2;
8272 tf_p = true;
8273 break;
8274
8275 default:
8276 gcc_unreachable ();
8277 }
8278
8279 if (optimize || !target
8280 || GET_MODE (target) != tmode
8281 || !insn_data[icode].operand[0].predicate (target, tmode))
8282 target = gen_reg_rtx (tmode);
8283 else if (memory_operand (target, tmode))
8284 num_memory++;
8285
8286 gcc_assert (nargs <= 4);
8287
8288 for (i = 0; i < nargs; i++)
8289 {
8290 tree arg = CALL_EXPR_ARG (exp, i);
8291 rtx op = expand_normal (arg);
8292 int adjust = (comparison_p) ? 1 : 0;
8293 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8294
8295 if (last_arg_constant && i == nargs - 1)
8296 {
8297 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8298 {
8299 enum insn_code new_icode = icode;
8300 switch (icode)
8301 {
8302 case CODE_FOR_xop_vpermil2v2df3:
8303 case CODE_FOR_xop_vpermil2v4sf3:
8304 case CODE_FOR_xop_vpermil2v4df3:
8305 case CODE_FOR_xop_vpermil2v8sf3:
8306 error ("the last argument must be a 2-bit immediate");
8307 return gen_reg_rtx (tmode);
8308 case CODE_FOR_xop_rotlv2di3:
8309 new_icode = CODE_FOR_rotlv2di3;
8310 goto xop_rotl;
8311 case CODE_FOR_xop_rotlv4si3:
8312 new_icode = CODE_FOR_rotlv4si3;
8313 goto xop_rotl;
8314 case CODE_FOR_xop_rotlv8hi3:
8315 new_icode = CODE_FOR_rotlv8hi3;
8316 goto xop_rotl;
8317 case CODE_FOR_xop_rotlv16qi3:
8318 new_icode = CODE_FOR_rotlv16qi3;
8319 xop_rotl:
8320 if (CONST_INT_P (op))
8321 {
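		  /* Rotates are modular, so reduce an out-of-range constant
		     count modulo the element width to satisfy the predicate.  */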
8322 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8323 op = GEN_INT (INTVAL (op) & mask);
8324 gcc_checking_assert
8325 (insn_data[icode].operand[i + 1].predicate (op, mode));
8326 }
8327 else
8328 {
8329 gcc_checking_assert
8330 (nargs == 2
8331 && insn_data[new_icode].operand[0].mode == tmode
8332 && insn_data[new_icode].operand[1].mode == tmode
8333 && insn_data[new_icode].operand[2].mode == mode
8334 && insn_data[new_icode].operand[0].predicate
8335 == insn_data[icode].operand[0].predicate
8336 && insn_data[new_icode].operand[1].predicate
8337 == insn_data[icode].operand[1].predicate);
8338 icode = new_icode;
8339 goto non_constant;
8340 }
8341 break;
8342 default:
8343 gcc_unreachable ();
8344 }
8345 }
8346 }
8347 else
8348 {
8349 non_constant:
8350 if (VECTOR_MODE_P (mode))
8351 op = safe_vector_operand (op, mode);
8352
8353 /* If we aren't optimizing, only allow one memory operand to be
8354 generated. */
8355 if (memory_operand (op, mode))
8356 num_memory++;
8357
8358 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8359
8360 if (optimize
8361 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8362 || num_memory > 1)
8363 op = force_reg (mode, op);
8364 }
8365
8366 args[i].op = op;
8367 args[i].mode = mode;
8368 }
8369
8370 switch (nargs)
8371 {
8372 case 1:
8373 pat = GEN_FCN (icode) (target, args[0].op);
8374 break;
8375
8376 case 2:
8377 if (tf_p)
8378 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8379 GEN_INT ((int)sub_code));
8380 else if (! comparison_p)
8381 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8382 else
8383 {
8384 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8385 args[0].op,
8386 args[1].op);
8387
8388 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8389 }
8390 break;
8391
8392 case 3:
8393 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8394 break;
8395
8396 case 4:
8397 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8398 break;
8399
8400 default:
8401 gcc_unreachable ();
8402 }
8403
8404 if (! pat)
8405 return 0;
8406
8407 emit_insn (pat);
8408 return target;
8409 }
8410
8411 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8412 insns with vec_merge. */
8413
8414 static rtx
8415 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8416 rtx target)
8417 {
8418 rtx pat;
8419 tree arg0 = CALL_EXPR_ARG (exp, 0);
8420 rtx op1, op0 = expand_normal (arg0);
8421 machine_mode tmode = insn_data[icode].operand[0].mode;
8422 machine_mode mode0 = insn_data[icode].operand[1].mode;
8423
8424 if (optimize || !target
8425 || GET_MODE (target) != tmode
8426 || !insn_data[icode].operand[0].predicate (target, tmode))
8427 target = gen_reg_rtx (tmode);
8428
8429 if (VECTOR_MODE_P (mode0))
8430 op0 = safe_vector_operand (op0, mode0);
8431
8432 if ((optimize && !register_operand (op0, mode0))
8433 || !insn_data[icode].operand[1].predicate (op0, mode0))
8434 op0 = copy_to_mode_reg (mode0, op0);
8435
8436 op1 = op0;
8437 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8438 op1 = copy_to_mode_reg (mode0, op1);
8439
8440 pat = GEN_FCN (icode) (target, op0, op1);
8441 if (! pat)
8442 return 0;
8443 emit_insn (pat);
8444 return target;
8445 }
8446
8447 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8448
8449 static rtx
8450 ix86_expand_sse_compare (const struct builtin_description *d,
8451 tree exp, rtx target, bool swap)
8452 {
8453 rtx pat;
8454 tree arg0 = CALL_EXPR_ARG (exp, 0);
8455 tree arg1 = CALL_EXPR_ARG (exp, 1);
8456 rtx op0 = expand_normal (arg0);
8457 rtx op1 = expand_normal (arg1);
8458 rtx op2;
8459 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8460 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8461 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8462 enum rtx_code comparison = d->comparison;
8463
8464 if (VECTOR_MODE_P (mode0))
8465 op0 = safe_vector_operand (op0, mode0);
8466 if (VECTOR_MODE_P (mode1))
8467 op1 = safe_vector_operand (op1, mode1);
8468
8469 /* Swap operands if we have a comparison that isn't available in
8470 hardware. */
8471 if (swap)
8472 std::swap (op0, op1);
8473
8474 if (optimize || !target
8475 || GET_MODE (target) != tmode
8476 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8477 target = gen_reg_rtx (tmode);
8478
8479 if ((optimize && !register_operand (op0, mode0))
8480 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8481 op0 = copy_to_mode_reg (mode0, op0);
8482 if ((optimize && !register_operand (op1, mode1))
8483 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8484 op1 = copy_to_mode_reg (mode1, op1);
8485
8486 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8487 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8488 if (! pat)
8489 return 0;
8490 emit_insn (pat);
8491 return target;
8492 }
8493
8494 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8495
8496 static rtx
8497 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8498 rtx target)
8499 {
8500 rtx pat;
8501 tree arg0 = CALL_EXPR_ARG (exp, 0);
8502 tree arg1 = CALL_EXPR_ARG (exp, 1);
8503 rtx op0 = expand_normal (arg0);
8504 rtx op1 = expand_normal (arg1);
8505 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8506 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8507 enum rtx_code comparison = d->comparison;
8508
8509 if (VECTOR_MODE_P (mode0))
8510 op0 = safe_vector_operand (op0, mode0);
8511 if (VECTOR_MODE_P (mode1))
8512 op1 = safe_vector_operand (op1, mode1);
8513
8514 /* Swap operands if we have a comparison that isn't available in
8515 hardware. */
8516 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8517 std::swap (op0, op1);
8518
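     /* Materialize the flags comparison as a setcc into the low byte of a
	zeroed SImode pseudo, so the returned SImode value is 0 or 1.  */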
8519 target = gen_reg_rtx (SImode);
8520 emit_move_insn (target, const0_rtx);
8521 target = gen_rtx_SUBREG (QImode, target, 0);
8522
8523 if ((optimize && !register_operand (op0, mode0))
8524 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8525 op0 = copy_to_mode_reg (mode0, op0);
8526 if ((optimize && !register_operand (op1, mode1))
8527 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8528 op1 = copy_to_mode_reg (mode1, op1);
8529
8530 pat = GEN_FCN (d->icode) (op0, op1);
8531 if (! pat)
8532 return 0;
8533 emit_insn (pat);
8534 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8535 gen_rtx_fmt_ee (comparison, QImode,
8536 SET_DEST (pat),
8537 const0_rtx)));
8538
8539 return SUBREG_REG (target);
8540 }
8541
8542 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8543
8544 static rtx
8545 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8546 rtx target)
8547 {
8548 rtx pat;
8549 tree arg0 = CALL_EXPR_ARG (exp, 0);
8550 rtx op1, op0 = expand_normal (arg0);
8551 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8552 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8553
8554 if (optimize || target == 0
8555 || GET_MODE (target) != tmode
8556 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8557 target = gen_reg_rtx (tmode);
8558
8559 if (VECTOR_MODE_P (mode0))
8560 op0 = safe_vector_operand (op0, mode0);
8561
8562 if ((optimize && !register_operand (op0, mode0))
8563 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8564 op0 = copy_to_mode_reg (mode0, op0);
8565
8566 op1 = GEN_INT (d->comparison);
8567
8568 pat = GEN_FCN (d->icode) (target, op0, op1);
8569 if (! pat)
8570 return 0;
8571 emit_insn (pat);
8572 return target;
8573 }
8574
8575 static rtx
8576 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8577 tree exp, rtx target)
8578 {
8579 rtx pat;
8580 tree arg0 = CALL_EXPR_ARG (exp, 0);
8581 tree arg1 = CALL_EXPR_ARG (exp, 1);
8582 rtx op0 = expand_normal (arg0);
8583 rtx op1 = expand_normal (arg1);
8584 rtx op2;
8585 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8586 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8587 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8588
8589 if (optimize || target == 0
8590 || GET_MODE (target) != tmode
8591 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8592 target = gen_reg_rtx (tmode);
8593
8594 op0 = safe_vector_operand (op0, mode0);
8595 op1 = safe_vector_operand (op1, mode1);
8596
8597 if ((optimize && !register_operand (op0, mode0))
8598 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8599 op0 = copy_to_mode_reg (mode0, op0);
8600 if ((optimize && !register_operand (op1, mode1))
8601 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8602 op1 = copy_to_mode_reg (mode1, op1);
8603
8604 op2 = GEN_INT (d->comparison);
8605
8606 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8607 if (! pat)
8608 return 0;
8609 emit_insn (pat);
8610 return target;
8611 }
8612
8613 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8614
8615 static rtx
8616 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8617 rtx target)
8618 {
8619 rtx pat;
8620 tree arg0 = CALL_EXPR_ARG (exp, 0);
8621 tree arg1 = CALL_EXPR_ARG (exp, 1);
8622 rtx op0 = expand_normal (arg0);
8623 rtx op1 = expand_normal (arg1);
8624 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8625 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8626 enum rtx_code comparison = d->comparison;
8627
8628 if (VECTOR_MODE_P (mode0))
8629 op0 = safe_vector_operand (op0, mode0);
8630 if (VECTOR_MODE_P (mode1))
8631 op1 = safe_vector_operand (op1, mode1);
8632
8633 target = gen_reg_rtx (SImode);
8634 emit_move_insn (target, const0_rtx);
8635 target = gen_rtx_SUBREG (QImode, target, 0);
8636
8637 if ((optimize && !register_operand (op0, mode0))
8638 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8639 op0 = copy_to_mode_reg (mode0, op0);
8640 if ((optimize && !register_operand (op1, mode1))
8641 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8642 op1 = copy_to_mode_reg (mode1, op1);
8643
8644 pat = GEN_FCN (d->icode) (op0, op1);
8645 if (! pat)
8646 return 0;
8647 emit_insn (pat);
8648 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8649 gen_rtx_fmt_ee (comparison, QImode,
8650 SET_DEST (pat),
8651 const0_rtx)));
8652
8653 return SUBREG_REG (target);
8654 }
8655
8656 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8657
8658 static rtx
8659 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8660 tree exp, rtx target)
8661 {
8662 rtx pat;
8663 tree arg0 = CALL_EXPR_ARG (exp, 0);
8664 tree arg1 = CALL_EXPR_ARG (exp, 1);
8665 tree arg2 = CALL_EXPR_ARG (exp, 2);
8666 tree arg3 = CALL_EXPR_ARG (exp, 3);
8667 tree arg4 = CALL_EXPR_ARG (exp, 4);
8668 rtx scratch0, scratch1;
8669 rtx op0 = expand_normal (arg0);
8670 rtx op1 = expand_normal (arg1);
8671 rtx op2 = expand_normal (arg2);
8672 rtx op3 = expand_normal (arg3);
8673 rtx op4 = expand_normal (arg4);
8674 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8675
8676 tmode0 = insn_data[d->icode].operand[0].mode;
8677 tmode1 = insn_data[d->icode].operand[1].mode;
8678 modev2 = insn_data[d->icode].operand[2].mode;
8679 modei3 = insn_data[d->icode].operand[3].mode;
8680 modev4 = insn_data[d->icode].operand[4].mode;
8681 modei5 = insn_data[d->icode].operand[5].mode;
8682 modeimm = insn_data[d->icode].operand[6].mode;
8683
8684 if (VECTOR_MODE_P (modev2))
8685 op0 = safe_vector_operand (op0, modev2);
8686 if (VECTOR_MODE_P (modev4))
8687 op2 = safe_vector_operand (op2, modev4);
8688
8689 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8690 op0 = copy_to_mode_reg (modev2, op0);
8691 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8692 op1 = copy_to_mode_reg (modei3, op1);
8693 if ((optimize && !register_operand (op2, modev4))
8694 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8695 op2 = copy_to_mode_reg (modev4, op2);
8696 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8697 op3 = copy_to_mode_reg (modei5, op3);
8698
8699 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8700 {
8701 error ("the fifth argument must be an 8-bit immediate");
8702 return const0_rtx;
8703 }
8704
8705 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8706 {
8707 if (optimize || !target
8708 || GET_MODE (target) != tmode0
8709 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8710 target = gen_reg_rtx (tmode0);
8711
8712 scratch1 = gen_reg_rtx (tmode1);
8713
8714 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8715 }
8716 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8717 {
8718 if (optimize || !target
8719 || GET_MODE (target) != tmode1
8720 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8721 target = gen_reg_rtx (tmode1);
8722
8723 scratch0 = gen_reg_rtx (tmode0);
8724
8725 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8726 }
8727 else
8728 {
8729 gcc_assert (d->flag);
8730
8731 scratch0 = gen_reg_rtx (tmode0);
8732 scratch1 = gen_reg_rtx (tmode1);
8733
8734 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8735 }
8736
8737 if (! pat)
8738 return 0;
8739
8740 emit_insn (pat);
8741
8742 if (d->flag)
8743 {
8744 target = gen_reg_rtx (SImode);
8745 emit_move_insn (target, const0_rtx);
8746 target = gen_rtx_SUBREG (QImode, target, 0);
8747
8748 emit_insn
8749 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8750 gen_rtx_fmt_ee (EQ, QImode,
8751 gen_rtx_REG ((machine_mode) d->flag,
8752 FLAGS_REG),
8753 const0_rtx)));
8754 return SUBREG_REG (target);
8755 }
8756 else
8757 return target;
8758 }
8759
8760
8761 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8762
8763 static rtx
8764 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8765 tree exp, rtx target)
8766 {
8767 rtx pat;
8768 tree arg0 = CALL_EXPR_ARG (exp, 0);
8769 tree arg1 = CALL_EXPR_ARG (exp, 1);
8770 tree arg2 = CALL_EXPR_ARG (exp, 2);
8771 rtx scratch0, scratch1;
8772 rtx op0 = expand_normal (arg0);
8773 rtx op1 = expand_normal (arg1);
8774 rtx op2 = expand_normal (arg2);
8775 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8776
8777 tmode0 = insn_data[d->icode].operand[0].mode;
8778 tmode1 = insn_data[d->icode].operand[1].mode;
8779 modev2 = insn_data[d->icode].operand[2].mode;
8780 modev3 = insn_data[d->icode].operand[3].mode;
8781 modeimm = insn_data[d->icode].operand[4].mode;
8782
8783 if (VECTOR_MODE_P (modev2))
8784 op0 = safe_vector_operand (op0, modev2);
8785 if (VECTOR_MODE_P (modev3))
8786 op1 = safe_vector_operand (op1, modev3);
8787
8788 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8789 op0 = copy_to_mode_reg (modev2, op0);
8790 if ((optimize && !register_operand (op1, modev3))
8791 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8792 op1 = copy_to_mode_reg (modev3, op1);
8793
8794 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8795 {
8796 error ("the third argument must be an 8-bit immediate");
8797 return const0_rtx;
8798 }
8799
8800 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8801 {
8802 if (optimize || !target
8803 || GET_MODE (target) != tmode0
8804 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8805 target = gen_reg_rtx (tmode0);
8806
8807 scratch1 = gen_reg_rtx (tmode1);
8808
8809 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8810 }
8811 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8812 {
8813 if (optimize || !target
8814 || GET_MODE (target) != tmode1
8815 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8816 target = gen_reg_rtx (tmode1);
8817
8818 scratch0 = gen_reg_rtx (tmode0);
8819
8820 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8821 }
8822 else
8823 {
8824 gcc_assert (d->flag);
8825
8826 scratch0 = gen_reg_rtx (tmode0);
8827 scratch1 = gen_reg_rtx (tmode1);
8828
8829 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8830 }
8831
8832 if (! pat)
8833 return 0;
8834
8835 emit_insn (pat);
8836
8837 if (d->flag)
8838 {
8839 target = gen_reg_rtx (SImode);
8840 emit_move_insn (target, const0_rtx);
8841 target = gen_rtx_SUBREG (QImode, target, 0);
8842
8843 emit_insn
8844 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8845 gen_rtx_fmt_ee (EQ, QImode,
8846 gen_rtx_REG ((machine_mode) d->flag,
8847 FLAGS_REG),
8848 const0_rtx)));
8849 return SUBREG_REG (target);
8850 }
8851 else
8852 return target;
8853 }
8854
8855 /* Fix up modeless constants to fit the required mode.  */
8856
8857 static rtx
8858 fixup_modeless_constant (rtx x, machine_mode mode)
8859 {
8860 if (GET_MODE (x) == VOIDmode)
8861 x = convert_to_mode (mode, x, 1);
8862 return x;
8863 }
8864
8865 /* Subroutine of ix86_expand_builtin to take care of insns with
8866 a variable number of operands. */
8867
8868 static rtx
8869 ix86_expand_args_builtin (const struct builtin_description *d,
8870 tree exp, rtx target)
8871 {
8872 rtx pat, real_target;
8873 unsigned int i, nargs;
8874 unsigned int nargs_constant = 0;
8875 unsigned int mask_pos = 0;
8876 int num_memory = 0;
8877 struct
8878 {
8879 rtx op;
8880 machine_mode mode;
8881 } args[6];
8882 bool second_arg_count = false;
8883 enum insn_code icode = d->icode;
8884 const struct insn_data_d *insn_p = &insn_data[icode];
8885 machine_mode tmode = insn_p->operand[0].mode;
8886 machine_mode rmode = VOIDmode;
8887 bool swap = false;
8888 enum rtx_code comparison = d->comparison;
8889
8890 switch ((enum ix86_builtin_func_type) d->flag)
8891 {
8892 case V2DF_FTYPE_V2DF_ROUND:
8893 case V4DF_FTYPE_V4DF_ROUND:
8894 case V8DF_FTYPE_V8DF_ROUND:
8895 case V4SF_FTYPE_V4SF_ROUND:
8896 case V8SF_FTYPE_V8SF_ROUND:
8897 case V16SF_FTYPE_V16SF_ROUND:
8898 case V4SI_FTYPE_V4SF_ROUND:
8899 case V8SI_FTYPE_V8SF_ROUND:
8900 case V16SI_FTYPE_V16SF_ROUND:
8901 return ix86_expand_sse_round (d, exp, target);
8902 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8903 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8904 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8905 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8906 case INT_FTYPE_V8SF_V8SF_PTEST:
8907 case INT_FTYPE_V4DI_V4DI_PTEST:
8908 case INT_FTYPE_V4DF_V4DF_PTEST:
8909 case INT_FTYPE_V4SF_V4SF_PTEST:
8910 case INT_FTYPE_V2DI_V2DI_PTEST:
8911 case INT_FTYPE_V2DF_V2DF_PTEST:
8912 return ix86_expand_sse_ptest (d, exp, target);
8913 case FLOAT128_FTYPE_FLOAT128:
8914 case FLOAT_FTYPE_FLOAT:
8915 case INT_FTYPE_INT:
8916 case UINT_FTYPE_UINT:
8917 case UINT16_FTYPE_UINT16:
8918 case UINT64_FTYPE_INT:
8919 case UINT64_FTYPE_UINT64:
8920 case INT64_FTYPE_INT64:
8921 case INT64_FTYPE_V4SF:
8922 case INT64_FTYPE_V2DF:
8923 case INT_FTYPE_V16QI:
8924 case INT_FTYPE_V8QI:
8925 case INT_FTYPE_V8SF:
8926 case INT_FTYPE_V4DF:
8927 case INT_FTYPE_V4SF:
8928 case INT_FTYPE_V2DF:
8929 case INT_FTYPE_V32QI:
8930 case V16QI_FTYPE_V16QI:
8931 case V8SI_FTYPE_V8SF:
8932 case V8SI_FTYPE_V4SI:
8933 case V8HI_FTYPE_V8HI:
8934 case V8HI_FTYPE_V16QI:
8935 case V8QI_FTYPE_V8QI:
8936 case V8SF_FTYPE_V8SF:
8937 case V8SF_FTYPE_V8SI:
8938 case V8SF_FTYPE_V4SF:
8939 case V8SF_FTYPE_V8HI:
8940 case V4SI_FTYPE_V4SI:
8941 case V4SI_FTYPE_V16QI:
8942 case V4SI_FTYPE_V4SF:
8943 case V4SI_FTYPE_V8SI:
8944 case V4SI_FTYPE_V8HI:
8945 case V4SI_FTYPE_V4DF:
8946 case V4SI_FTYPE_V2DF:
8947 case V4HI_FTYPE_V4HI:
8948 case V4DF_FTYPE_V4DF:
8949 case V4DF_FTYPE_V4SI:
8950 case V4DF_FTYPE_V4SF:
8951 case V4DF_FTYPE_V2DF:
8952 case V4SF_FTYPE_V4SF:
8953 case V4SF_FTYPE_V4SI:
8954 case V4SF_FTYPE_V8SF:
8955 case V4SF_FTYPE_V4DF:
8956 case V4SF_FTYPE_V8HI:
8957 case V4SF_FTYPE_V2DF:
8958 case V2DI_FTYPE_V2DI:
8959 case V2DI_FTYPE_V16QI:
8960 case V2DI_FTYPE_V8HI:
8961 case V2DI_FTYPE_V4SI:
8962 case V2DF_FTYPE_V2DF:
8963 case V2DF_FTYPE_V4SI:
8964 case V2DF_FTYPE_V4DF:
8965 case V2DF_FTYPE_V4SF:
8966 case V2DF_FTYPE_V2SI:
8967 case V2SI_FTYPE_V2SI:
8968 case V2SI_FTYPE_V4SF:
8969 case V2SI_FTYPE_V2SF:
8970 case V2SI_FTYPE_V2DF:
8971 case V2SF_FTYPE_V2SF:
8972 case V2SF_FTYPE_V2SI:
8973 case V32QI_FTYPE_V32QI:
8974 case V32QI_FTYPE_V16QI:
8975 case V16HI_FTYPE_V16HI:
8976 case V16HI_FTYPE_V8HI:
8977 case V8SI_FTYPE_V8SI:
8978 case V16HI_FTYPE_V16QI:
8979 case V8SI_FTYPE_V16QI:
8980 case V4DI_FTYPE_V16QI:
8981 case V8SI_FTYPE_V8HI:
8982 case V4DI_FTYPE_V8HI:
8983 case V4DI_FTYPE_V4SI:
8984 case V4DI_FTYPE_V2DI:
8985 case UQI_FTYPE_UQI:
8986 case UHI_FTYPE_UHI:
8987 case USI_FTYPE_USI:
8988 case USI_FTYPE_UQI:
8989 case USI_FTYPE_UHI:
8990 case UDI_FTYPE_UDI:
8991 case UHI_FTYPE_V16QI:
8992 case USI_FTYPE_V32QI:
8993 case UDI_FTYPE_V64QI:
8994 case V16QI_FTYPE_UHI:
8995 case V32QI_FTYPE_USI:
8996 case V64QI_FTYPE_UDI:
8997 case V8HI_FTYPE_UQI:
8998 case V16HI_FTYPE_UHI:
8999 case V32HI_FTYPE_USI:
9000 case V4SI_FTYPE_UQI:
9001 case V8SI_FTYPE_UQI:
9002 case V4SI_FTYPE_UHI:
9003 case V8SI_FTYPE_UHI:
9004 case UQI_FTYPE_V8HI:
9005 case UHI_FTYPE_V16HI:
9006 case USI_FTYPE_V32HI:
9007 case UQI_FTYPE_V4SI:
9008 case UQI_FTYPE_V8SI:
9009 case UHI_FTYPE_V16SI:
9010 case UQI_FTYPE_V2DI:
9011 case UQI_FTYPE_V4DI:
9012 case UQI_FTYPE_V8DI:
9013 case V16SI_FTYPE_UHI:
9014 case V2DI_FTYPE_UQI:
9015 case V4DI_FTYPE_UQI:
9016 case V16SI_FTYPE_INT:
9017 case V16SF_FTYPE_V8SF:
9018 case V16SI_FTYPE_V8SI:
9019 case V16SF_FTYPE_V4SF:
9020 case V16SI_FTYPE_V4SI:
9021 case V16SI_FTYPE_V16SF:
9022 case V16SI_FTYPE_V16SI:
9023 case V64QI_FTYPE_V64QI:
9024 case V32HI_FTYPE_V32HI:
9025 case V16SF_FTYPE_V16SF:
9026 case V8DI_FTYPE_UQI:
9027 case V8DI_FTYPE_V8DI:
9028 case V8DF_FTYPE_V4DF:
9029 case V8DF_FTYPE_V2DF:
9030 case V8DF_FTYPE_V8DF:
9031 case V4DI_FTYPE_V4DI:
9032 case V16HI_FTYPE_V16SF:
9033 case V8HI_FTYPE_V8SF:
9034 case V8HI_FTYPE_V4SF:
9035 nargs = 1;
9036 break;
9037 case V4SF_FTYPE_V4SF_VEC_MERGE:
9038 case V2DF_FTYPE_V2DF_VEC_MERGE:
9039 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9040 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9041 case V16QI_FTYPE_V16QI_V16QI:
9042 case V16QI_FTYPE_V8HI_V8HI:
9043 case V16SF_FTYPE_V16SF_V16SF:
9044 case V8QI_FTYPE_V8QI_V8QI:
9045 case V8QI_FTYPE_V4HI_V4HI:
9046 case V8HI_FTYPE_V8HI_V8HI:
9047 case V8HI_FTYPE_V16QI_V16QI:
9048 case V8HI_FTYPE_V4SI_V4SI:
9049 case V8SF_FTYPE_V8SF_V8SF:
9050 case V8SF_FTYPE_V8SF_V8SI:
9051 case V8DF_FTYPE_V8DF_V8DF:
9052 case V4SI_FTYPE_V4SI_V4SI:
9053 case V4SI_FTYPE_V8HI_V8HI:
9054 case V4SI_FTYPE_V2DF_V2DF:
9055 case V4HI_FTYPE_V4HI_V4HI:
9056 case V4HI_FTYPE_V8QI_V8QI:
9057 case V4HI_FTYPE_V2SI_V2SI:
9058 case V4DF_FTYPE_V4DF_V4DF:
9059 case V4DF_FTYPE_V4DF_V4DI:
9060 case V4SF_FTYPE_V4SF_V4SF:
9061 case V4SF_FTYPE_V4SF_V4SI:
9062 case V4SF_FTYPE_V4SF_V2SI:
9063 case V4SF_FTYPE_V4SF_V2DF:
9064 case V4SF_FTYPE_V4SF_UINT:
9065 case V4SF_FTYPE_V4SF_DI:
9066 case V4SF_FTYPE_V4SF_SI:
9067 case V2DI_FTYPE_V2DI_V2DI:
9068 case V2DI_FTYPE_V16QI_V16QI:
9069 case V2DI_FTYPE_V4SI_V4SI:
9070 case V2DI_FTYPE_V2DI_V16QI:
9071 case V2SI_FTYPE_V2SI_V2SI:
9072 case V2SI_FTYPE_V4HI_V4HI:
9073 case V2SI_FTYPE_V2SF_V2SF:
9074 case V2DF_FTYPE_V2DF_V2DF:
9075 case V2DF_FTYPE_V2DF_V4SF:
9076 case V2DF_FTYPE_V2DF_V2DI:
9077 case V2DF_FTYPE_V2DF_DI:
9078 case V2DF_FTYPE_V2DF_SI:
9079 case V2DF_FTYPE_V2DF_UINT:
9080 case V2SF_FTYPE_V2SF_V2SF:
9081 case V1DI_FTYPE_V1DI_V1DI:
9082 case V1DI_FTYPE_V8QI_V8QI:
9083 case V1DI_FTYPE_V2SI_V2SI:
9084 case V32QI_FTYPE_V16HI_V16HI:
9085 case V16HI_FTYPE_V8SI_V8SI:
9086 case V64QI_FTYPE_V64QI_V64QI:
9087 case V32QI_FTYPE_V32QI_V32QI:
9088 case V16HI_FTYPE_V32QI_V32QI:
9089 case V16HI_FTYPE_V16HI_V16HI:
9090 case V8SI_FTYPE_V4DF_V4DF:
9091 case V8SI_FTYPE_V8SI_V8SI:
9092 case V8SI_FTYPE_V16HI_V16HI:
9093 case V4DI_FTYPE_V4DI_V4DI:
9094 case V4DI_FTYPE_V8SI_V8SI:
9095 case V8DI_FTYPE_V64QI_V64QI:
9096 if (comparison == UNKNOWN)
9097 return ix86_expand_binop_builtin (icode, exp, target);
9098 nargs = 2;
9099 break;
9100 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9101 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9102 gcc_assert (comparison != UNKNOWN);
9103 nargs = 2;
9104 swap = true;
9105 break;
9106 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9107 case V16HI_FTYPE_V16HI_SI_COUNT:
9108 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9109 case V8SI_FTYPE_V8SI_SI_COUNT:
9110 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9111 case V4DI_FTYPE_V4DI_INT_COUNT:
9112 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9113 case V8HI_FTYPE_V8HI_SI_COUNT:
9114 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9115 case V4SI_FTYPE_V4SI_SI_COUNT:
9116 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9117 case V4HI_FTYPE_V4HI_SI_COUNT:
9118 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9119 case V2DI_FTYPE_V2DI_SI_COUNT:
9120 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9121 case V2SI_FTYPE_V2SI_SI_COUNT:
9122 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9123 case V1DI_FTYPE_V1DI_SI_COUNT:
9124 nargs = 2;
9125 second_arg_count = true;
9126 break;
9127 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9128 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9129 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9130 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9131 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9132 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9133 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9134 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9135 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9136 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9137 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9138 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9139 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9140 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9141 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9142 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9143 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9144 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9145 nargs = 4;
9146 second_arg_count = true;
9147 break;
9148 case UINT64_FTYPE_UINT64_UINT64:
9149 case UINT_FTYPE_UINT_UINT:
9150 case UINT_FTYPE_UINT_USHORT:
9151 case UINT_FTYPE_UINT_UCHAR:
9152 case UINT16_FTYPE_UINT16_INT:
9153 case UINT8_FTYPE_UINT8_INT:
9154 case UQI_FTYPE_UQI_UQI:
9155 case UHI_FTYPE_UHI_UHI:
9156 case USI_FTYPE_USI_USI:
9157 case UDI_FTYPE_UDI_UDI:
9158 case V16SI_FTYPE_V8DF_V8DF:
9159 case V32HI_FTYPE_V16SF_V16SF:
9160 case V16HI_FTYPE_V8SF_V8SF:
9161 case V8HI_FTYPE_V4SF_V4SF:
9162 case V16HI_FTYPE_V16SF_UHI:
9163 case V8HI_FTYPE_V8SF_UQI:
9164 case V8HI_FTYPE_V4SF_UQI:
9165 nargs = 2;
9166 break;
9167 case V2DI_FTYPE_V2DI_INT_CONVERT:
9168 nargs = 2;
9169 rmode = V1TImode;
9170 nargs_constant = 1;
9171 break;
9172 case V4DI_FTYPE_V4DI_INT_CONVERT:
9173 nargs = 2;
9174 rmode = V2TImode;
9175 nargs_constant = 1;
9176 break;
9177 case V8DI_FTYPE_V8DI_INT_CONVERT:
9178 nargs = 2;
9179 rmode = V4TImode;
9180 nargs_constant = 1;
9181 break;
9182 case V8HI_FTYPE_V8HI_INT:
9183 case V8HI_FTYPE_V8SF_INT:
9184 case V16HI_FTYPE_V16SF_INT:
9185 case V8HI_FTYPE_V4SF_INT:
9186 case V8SF_FTYPE_V8SF_INT:
9187 case V4SF_FTYPE_V16SF_INT:
9188 case V16SF_FTYPE_V16SF_INT:
9189 case V4SI_FTYPE_V4SI_INT:
9190 case V4SI_FTYPE_V8SI_INT:
9191 case V4HI_FTYPE_V4HI_INT:
9192 case V4DF_FTYPE_V4DF_INT:
9193 case V4DF_FTYPE_V8DF_INT:
9194 case V4SF_FTYPE_V4SF_INT:
9195 case V4SF_FTYPE_V8SF_INT:
9196 case V2DI_FTYPE_V2DI_INT:
9197 case V2DF_FTYPE_V2DF_INT:
9198 case V2DF_FTYPE_V4DF_INT:
9199 case V16HI_FTYPE_V16HI_INT:
9200 case V8SI_FTYPE_V8SI_INT:
9201 case V16SI_FTYPE_V16SI_INT:
9202 case V4SI_FTYPE_V16SI_INT:
9203 case V4DI_FTYPE_V4DI_INT:
9204 case V2DI_FTYPE_V4DI_INT:
9205 case V4DI_FTYPE_V8DI_INT:
9206 case UQI_FTYPE_UQI_UQI_CONST:
9207 case UHI_FTYPE_UHI_UQI:
9208 case USI_FTYPE_USI_UQI:
9209 case UDI_FTYPE_UDI_UQI:
9210 nargs = 2;
9211 nargs_constant = 1;
9212 break;
9213 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9214 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9215 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9216 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9217 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9218 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9219 case UHI_FTYPE_V16SI_V16SI_UHI:
9220 case UQI_FTYPE_V8DI_V8DI_UQI:
9221 case V16HI_FTYPE_V16SI_V16HI_UHI:
9222 case V16QI_FTYPE_V16SI_V16QI_UHI:
9223 case V16QI_FTYPE_V8DI_V16QI_UQI:
9224 case V16SF_FTYPE_V16SF_V16SF_UHI:
9225 case V16SF_FTYPE_V4SF_V16SF_UHI:
9226 case V16SI_FTYPE_SI_V16SI_UHI:
9227 case V16SI_FTYPE_V16HI_V16SI_UHI:
9228 case V16SI_FTYPE_V16QI_V16SI_UHI:
9229 case V8SF_FTYPE_V4SF_V8SF_UQI:
9230 case V4DF_FTYPE_V2DF_V4DF_UQI:
9231 case V8SI_FTYPE_V4SI_V8SI_UQI:
9232 case V8SI_FTYPE_SI_V8SI_UQI:
9233 case V4SI_FTYPE_V4SI_V4SI_UQI:
9234 case V4SI_FTYPE_SI_V4SI_UQI:
9235 case V4DI_FTYPE_V2DI_V4DI_UQI:
9236 case V4DI_FTYPE_DI_V4DI_UQI:
9237 case V2DI_FTYPE_V2DI_V2DI_UQI:
9238 case V2DI_FTYPE_DI_V2DI_UQI:
9239 case V64QI_FTYPE_V64QI_V64QI_UDI:
9240 case V64QI_FTYPE_V16QI_V64QI_UDI:
9241 case V64QI_FTYPE_QI_V64QI_UDI:
9242 case V32QI_FTYPE_V32QI_V32QI_USI:
9243 case V32QI_FTYPE_V16QI_V32QI_USI:
9244 case V32QI_FTYPE_QI_V32QI_USI:
9245 case V16QI_FTYPE_V16QI_V16QI_UHI:
9246 case V16QI_FTYPE_QI_V16QI_UHI:
9247 case V32HI_FTYPE_V8HI_V32HI_USI:
9248 case V32HI_FTYPE_HI_V32HI_USI:
9249 case V16HI_FTYPE_V8HI_V16HI_UHI:
9250 case V16HI_FTYPE_HI_V16HI_UHI:
9251 case V8HI_FTYPE_V8HI_V8HI_UQI:
9252 case V8HI_FTYPE_HI_V8HI_UQI:
9253 case V8SF_FTYPE_V8HI_V8SF_UQI:
9254 case V4SF_FTYPE_V8HI_V4SF_UQI:
9255 case V8SI_FTYPE_V8SF_V8SI_UQI:
9256 case V4SI_FTYPE_V4SF_V4SI_UQI:
9257 case V4DI_FTYPE_V4SF_V4DI_UQI:
9258 case V2DI_FTYPE_V4SF_V2DI_UQI:
9259 case V4SF_FTYPE_V4DI_V4SF_UQI:
9260 case V4SF_FTYPE_V2DI_V4SF_UQI:
9261 case V4DF_FTYPE_V4DI_V4DF_UQI:
9262 case V2DF_FTYPE_V2DI_V2DF_UQI:
9263 case V16QI_FTYPE_V8HI_V16QI_UQI:
9264 case V16QI_FTYPE_V16HI_V16QI_UHI:
9265 case V16QI_FTYPE_V4SI_V16QI_UQI:
9266 case V16QI_FTYPE_V8SI_V16QI_UQI:
9267 case V8HI_FTYPE_V4SI_V8HI_UQI:
9268 case V8HI_FTYPE_V8SI_V8HI_UQI:
9269 case V16QI_FTYPE_V2DI_V16QI_UQI:
9270 case V16QI_FTYPE_V4DI_V16QI_UQI:
9271 case V8HI_FTYPE_V2DI_V8HI_UQI:
9272 case V8HI_FTYPE_V4DI_V8HI_UQI:
9273 case V4SI_FTYPE_V2DI_V4SI_UQI:
9274 case V4SI_FTYPE_V4DI_V4SI_UQI:
9275 case V32QI_FTYPE_V32HI_V32QI_USI:
9276 case UHI_FTYPE_V16QI_V16QI_UHI:
9277 case USI_FTYPE_V32QI_V32QI_USI:
9278 case UDI_FTYPE_V64QI_V64QI_UDI:
9279 case UQI_FTYPE_V8HI_V8HI_UQI:
9280 case UHI_FTYPE_V16HI_V16HI_UHI:
9281 case USI_FTYPE_V32HI_V32HI_USI:
9282 case UQI_FTYPE_V4SI_V4SI_UQI:
9283 case UQI_FTYPE_V8SI_V8SI_UQI:
9284 case UQI_FTYPE_V2DI_V2DI_UQI:
9285 case UQI_FTYPE_V4DI_V4DI_UQI:
9286 case V4SF_FTYPE_V2DF_V4SF_UQI:
9287 case V4SF_FTYPE_V4DF_V4SF_UQI:
9288 case V16SI_FTYPE_V16SI_V16SI_UHI:
9289 case V16SI_FTYPE_V4SI_V16SI_UHI:
9290 case V2DI_FTYPE_V4SI_V2DI_UQI:
9291 case V2DI_FTYPE_V8HI_V2DI_UQI:
9292 case V2DI_FTYPE_V16QI_V2DI_UQI:
9293 case V4DI_FTYPE_V4DI_V4DI_UQI:
9294 case V4DI_FTYPE_V4SI_V4DI_UQI:
9295 case V4DI_FTYPE_V8HI_V4DI_UQI:
9296 case V4DI_FTYPE_V16QI_V4DI_UQI:
9297 case V4DI_FTYPE_V4DF_V4DI_UQI:
9298 case V2DI_FTYPE_V2DF_V2DI_UQI:
9299 case V4SI_FTYPE_V4DF_V4SI_UQI:
9300 case V4SI_FTYPE_V2DF_V4SI_UQI:
9301 case V4SI_FTYPE_V8HI_V4SI_UQI:
9302 case V4SI_FTYPE_V16QI_V4SI_UQI:
9303 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9304 case V8DF_FTYPE_V2DF_V8DF_UQI:
9305 case V8DF_FTYPE_V4DF_V8DF_UQI:
9306 case V8DF_FTYPE_V8DF_V8DF_UQI:
9307 case V8SF_FTYPE_V8SF_V8SF_UQI:
9308 case V8SF_FTYPE_V8SI_V8SF_UQI:
9309 case V4DF_FTYPE_V4DF_V4DF_UQI:
9310 case V4SF_FTYPE_V4SF_V4SF_UQI:
9311 case V2DF_FTYPE_V2DF_V2DF_UQI:
9312 case V2DF_FTYPE_V4SF_V2DF_UQI:
9313 case V2DF_FTYPE_V4SI_V2DF_UQI:
9314 case V4SF_FTYPE_V4SI_V4SF_UQI:
9315 case V4DF_FTYPE_V4SF_V4DF_UQI:
9316 case V4DF_FTYPE_V4SI_V4DF_UQI:
9317 case V8SI_FTYPE_V8SI_V8SI_UQI:
9318 case V8SI_FTYPE_V8HI_V8SI_UQI:
9319 case V8SI_FTYPE_V16QI_V8SI_UQI:
9320 case V8DF_FTYPE_V8SI_V8DF_UQI:
9321 case V8DI_FTYPE_DI_V8DI_UQI:
9322 case V16SF_FTYPE_V8SF_V16SF_UHI:
9323 case V16SI_FTYPE_V8SI_V16SI_UHI:
9324 case V16HI_FTYPE_V16HI_V16HI_UHI:
9325 case V8HI_FTYPE_V16QI_V8HI_UQI:
9326 case V16HI_FTYPE_V16QI_V16HI_UHI:
9327 case V32HI_FTYPE_V32HI_V32HI_USI:
9328 case V32HI_FTYPE_V32QI_V32HI_USI:
9329 case V8DI_FTYPE_V16QI_V8DI_UQI:
9330 case V8DI_FTYPE_V2DI_V8DI_UQI:
9331 case V8DI_FTYPE_V4DI_V8DI_UQI:
9332 case V8DI_FTYPE_V8DI_V8DI_UQI:
9333 case V8DI_FTYPE_V8HI_V8DI_UQI:
9334 case V8DI_FTYPE_V8SI_V8DI_UQI:
9335 case V8HI_FTYPE_V8DI_V8HI_UQI:
9336 case V8SI_FTYPE_V8DI_V8SI_UQI:
9337 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9338 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9339 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9340 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9341 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9342 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9343 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9344 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9345 case V32HI_FTYPE_V16SF_V16SF_USI:
9346 case V16HI_FTYPE_V8SF_V8SF_UHI:
9347 case V8HI_FTYPE_V4SF_V4SF_UQI:
9348 case V16HI_FTYPE_V16SF_V16HI_UHI:
9349 case V8HI_FTYPE_V8SF_V8HI_UQI:
9350 case V8HI_FTYPE_V4SF_V8HI_UQI:
9351 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9352 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9353 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9354 nargs = 3;
9355 break;
9356 case V32QI_FTYPE_V32QI_V32QI_INT:
9357 case V16HI_FTYPE_V16HI_V16HI_INT:
9358 case V16QI_FTYPE_V16QI_V16QI_INT:
9359 case V4DI_FTYPE_V4DI_V4DI_INT:
9360 case V8HI_FTYPE_V8HI_V8HI_INT:
9361 case V8SI_FTYPE_V8SI_V8SI_INT:
9362 case V8SI_FTYPE_V8SI_V4SI_INT:
9363 case V8SF_FTYPE_V8SF_V8SF_INT:
9364 case V8SF_FTYPE_V8SF_V4SF_INT:
9365 case V4SI_FTYPE_V4SI_V4SI_INT:
9366 case V4DF_FTYPE_V4DF_V4DF_INT:
9367 case V16SF_FTYPE_V16SF_V16SF_INT:
9368 case V16SF_FTYPE_V16SF_V4SF_INT:
9369 case V16SI_FTYPE_V16SI_V4SI_INT:
9370 case V4DF_FTYPE_V4DF_V2DF_INT:
9371 case V4SF_FTYPE_V4SF_V4SF_INT:
9372 case V2DI_FTYPE_V2DI_V2DI_INT:
9373 case V4DI_FTYPE_V4DI_V2DI_INT:
9374 case V2DF_FTYPE_V2DF_V2DF_INT:
9375 case UQI_FTYPE_V8DI_V8UDI_INT:
9376 case UQI_FTYPE_V8DF_V8DF_INT:
9377 case UQI_FTYPE_V2DF_V2DF_INT:
9378 case UQI_FTYPE_V4SF_V4SF_INT:
9379 case UHI_FTYPE_V16SI_V16SI_INT:
9380 case UHI_FTYPE_V16SF_V16SF_INT:
9381 case V64QI_FTYPE_V64QI_V64QI_INT:
9382 case V32HI_FTYPE_V32HI_V32HI_INT:
9383 case V16SI_FTYPE_V16SI_V16SI_INT:
9384 case V8DI_FTYPE_V8DI_V8DI_INT:
9385 nargs = 3;
9386 nargs_constant = 1;
9387 break;
9388 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9389 nargs = 3;
9390 rmode = V4DImode;
9391 nargs_constant = 1;
9392 break;
9393 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9394 nargs = 3;
9395 rmode = V2DImode;
9396 nargs_constant = 1;
9397 break;
9398 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9399 nargs = 3;
9400 rmode = DImode;
9401 nargs_constant = 1;
9402 break;
9403 case V2DI_FTYPE_V2DI_UINT_UINT:
9404 nargs = 3;
9405 nargs_constant = 2;
9406 break;
9407 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9408 nargs = 3;
9409 rmode = V8DImode;
9410 nargs_constant = 1;
9411 break;
9412 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9413 nargs = 5;
9414 rmode = V8DImode;
9415 mask_pos = 2;
9416 nargs_constant = 1;
9417 break;
9418 case QI_FTYPE_V8DF_INT_UQI:
9419 case QI_FTYPE_V4DF_INT_UQI:
9420 case QI_FTYPE_V2DF_INT_UQI:
9421 case HI_FTYPE_V16SF_INT_UHI:
9422 case QI_FTYPE_V8SF_INT_UQI:
9423 case QI_FTYPE_V4SF_INT_UQI:
9424 case V4SI_FTYPE_V4SI_V4SI_UHI:
9425 case V8SI_FTYPE_V8SI_V8SI_UHI:
9426 nargs = 3;
9427 mask_pos = 1;
9428 nargs_constant = 1;
9429 break;
9430 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9431 nargs = 5;
9432 rmode = V4DImode;
9433 mask_pos = 2;
9434 nargs_constant = 1;
9435 break;
9436 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9437 nargs = 5;
9438 rmode = V2DImode;
9439 mask_pos = 2;
9440 nargs_constant = 1;
9441 break;
9442 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9443 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9444 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9445 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9446 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9447 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9448 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9449 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9450 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9451 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9452 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9453 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9454 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9455 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9456 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9457 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9458 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9459 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9460 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9461 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9462 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9463 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9464 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9465 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9466 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9467 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9468 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9469 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9470 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9471 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9472 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9473 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9474 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9475 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9476 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9477 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9478 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9479 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9480 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9481 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9482 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9483 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9484 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9485 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9486 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9487 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9488 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9489 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9490 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9491 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9492 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9493 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9494 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9495 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9496 nargs = 4;
9497 break;
9498 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9499 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9500 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9501 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9502 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9503 nargs = 4;
9504 nargs_constant = 1;
9505 break;
9506 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9507 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9508 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9509 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9510 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9511 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9512 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9513 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9514 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9515 case USI_FTYPE_V32QI_V32QI_INT_USI:
9516 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9517 case USI_FTYPE_V32HI_V32HI_INT_USI:
9518 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9519 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9520 nargs = 4;
9521 mask_pos = 1;
9522 nargs_constant = 1;
9523 break;
9524 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9525 nargs = 4;
9526 nargs_constant = 2;
9527 break;
9528 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9529 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9530 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9531 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9532 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9533 nargs = 4;
9534 break;
9535 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9536 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9537 mask_pos = 1;
9538 nargs = 4;
9539 nargs_constant = 1;
9540 break;
9541 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9542 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9543 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9544 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9545 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9546 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9547 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9548 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9549 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9550 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9551 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9552 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9553 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9554 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9555 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9556 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9557 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9558 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9559 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9560 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9561 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9562 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9563 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9564 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9565 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9566 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9567 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9568 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9569 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9570 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9571 nargs = 4;
9572 mask_pos = 2;
9573 nargs_constant = 1;
9574 break;
9575 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9576 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9577 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9578 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9579 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9580 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9581 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9582 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9583 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9584 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9585 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9586 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9587 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9588 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9589 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9590 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9591 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9592 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9593 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9594 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9595 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9596 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9597 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9598 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9599 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9600 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9601 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9602 nargs = 5;
9603 mask_pos = 2;
9604 nargs_constant = 1;
9605 break;
9606 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9607 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9608 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9609 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9610 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9611 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9612 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9613 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9614 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9615 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9616 nargs = 5;
9617 mask_pos = 1;
9618 nargs_constant = 1;
9619 break;
9620 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9621 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9622 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9623 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9624 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9625 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9626 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9627 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9628 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9629 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9630 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9631 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9632 nargs = 5;
9633 mask_pos = 1;
9634 nargs_constant = 2;
9635 break;
9636
9637 default:
9638 gcc_unreachable ();
9639 }
9640
9641 gcc_assert (nargs <= ARRAY_SIZE (args));
9642
9643 if (comparison != UNKNOWN)
9644 {
9645 gcc_assert (nargs == 2);
9646 return ix86_expand_sse_compare (d, exp, target, swap);
9647 }
9648
9649 if (rmode == VOIDmode || rmode == tmode)
9650 {
9651 if (optimize
9652 || target == 0
9653 || GET_MODE (target) != tmode
9654 || !insn_p->operand[0].predicate (target, tmode))
9655 target = gen_reg_rtx (tmode);
9656 else if (memory_operand (target, tmode))
9657 num_memory++;
9658 real_target = target;
9659 }
9660 else
9661 {
9662 real_target = gen_reg_rtx (tmode);
9663 target = lowpart_subreg (rmode, real_target, tmode);
9664 }
9665
9666 for (i = 0; i < nargs; i++)
9667 {
9668 tree arg = CALL_EXPR_ARG (exp, i);
9669 rtx op = expand_normal (arg);
9670 machine_mode mode = insn_p->operand[i + 1].mode;
9671 bool match = insn_p->operand[i + 1].predicate (op, mode);
9672
9673 if (second_arg_count && i == 1)
9674 {
9675 /* SIMD shift insns take either an 8-bit immediate or a
9676 register as the count, but the builtin functions take an
9677 int as the count.  If the count doesn't match, put it in a
9678 register.  The instructions use a 64-bit count; if op is
9679 only 32-bit, zero-extend it, since negative shift counts
9680 are undefined behavior and zero-extension is more
9681 efficient. */
9682 if (!match)
9683 {
9684 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9685 op = convert_modes (mode, GET_MODE (op), op, 1);
9686 else
9687 op = lowpart_subreg (mode, op, GET_MODE (op));
9688 if (!insn_p->operand[i + 1].predicate (op, mode))
9689 op = copy_to_reg (op);
9690 }
9691 }
9692 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9693 || (!mask_pos && (nargs - i) <= nargs_constant))
9694 {
9695 if (!match)
9696 switch (icode)
9697 {
9698 case CODE_FOR_avx_vinsertf128v4di:
9699 case CODE_FOR_avx_vextractf128v4di:
9700 error ("the last argument must be an 1-bit immediate");
9701 return const0_rtx;
9702
9703 case CODE_FOR_avx512f_cmpv8di3_mask:
9704 case CODE_FOR_avx512f_cmpv16si3_mask:
9705 case CODE_FOR_avx512f_ucmpv8di3_mask:
9706 case CODE_FOR_avx512f_ucmpv16si3_mask:
9707 case CODE_FOR_avx512vl_cmpv4di3_mask:
9708 case CODE_FOR_avx512vl_cmpv8si3_mask:
9709 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9710 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9711 case CODE_FOR_avx512vl_cmpv2di3_mask:
9712 case CODE_FOR_avx512vl_cmpv4si3_mask:
9713 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9714 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9715 error ("the last argument must be a 3-bit immediate");
9716 return const0_rtx;
9717
9718 case CODE_FOR_sse4_1_roundsd:
9719 case CODE_FOR_sse4_1_roundss:
9720
9721 case CODE_FOR_sse4_1_roundpd:
9722 case CODE_FOR_sse4_1_roundps:
9723 case CODE_FOR_avx_roundpd256:
9724 case CODE_FOR_avx_roundps256:
9725
9726 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9727 case CODE_FOR_sse4_1_roundps_sfix:
9728 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9729 case CODE_FOR_avx_roundps_sfix256:
9730
9731 case CODE_FOR_sse4_1_blendps:
9732 case CODE_FOR_avx_blendpd256:
9733 case CODE_FOR_avx_vpermilv4df:
9734 case CODE_FOR_avx_vpermilv4df_mask:
9735 case CODE_FOR_avx512f_getmantv8df_mask:
9736 case CODE_FOR_avx512f_getmantv16sf_mask:
9737 case CODE_FOR_avx512vl_getmantv8sf_mask:
9738 case CODE_FOR_avx512vl_getmantv4df_mask:
9739 case CODE_FOR_avx512vl_getmantv4sf_mask:
9740 case CODE_FOR_avx512vl_getmantv2df_mask:
9741 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9742 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9743 case CODE_FOR_avx512dq_rangepv4df_mask:
9744 case CODE_FOR_avx512dq_rangepv8sf_mask:
9745 case CODE_FOR_avx512dq_rangepv2df_mask:
9746 case CODE_FOR_avx512dq_rangepv4sf_mask:
9747 case CODE_FOR_avx_shufpd256_mask:
9748 error ("the last argument must be a 4-bit immediate");
9749 return const0_rtx;
9750
9751 case CODE_FOR_sha1rnds4:
9752 case CODE_FOR_sse4_1_blendpd:
9753 case CODE_FOR_avx_vpermilv2df:
9754 case CODE_FOR_avx_vpermilv2df_mask:
9755 case CODE_FOR_xop_vpermil2v2df3:
9756 case CODE_FOR_xop_vpermil2v4sf3:
9757 case CODE_FOR_xop_vpermil2v4df3:
9758 case CODE_FOR_xop_vpermil2v8sf3:
9759 case CODE_FOR_avx512f_vinsertf32x4_mask:
9760 case CODE_FOR_avx512f_vinserti32x4_mask:
9761 case CODE_FOR_avx512f_vextractf32x4_mask:
9762 case CODE_FOR_avx512f_vextracti32x4_mask:
9763 case CODE_FOR_sse2_shufpd:
9764 case CODE_FOR_sse2_shufpd_mask:
9765 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9766 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9767 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9768 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9769 error ("the last argument must be a 2-bit immediate");
9770 return const0_rtx;
9771
9772 case CODE_FOR_avx_vextractf128v4df:
9773 case CODE_FOR_avx_vextractf128v8sf:
9774 case CODE_FOR_avx_vextractf128v8si:
9775 case CODE_FOR_avx_vinsertf128v4df:
9776 case CODE_FOR_avx_vinsertf128v8sf:
9777 case CODE_FOR_avx_vinsertf128v8si:
9778 case CODE_FOR_avx512f_vinsertf64x4_mask:
9779 case CODE_FOR_avx512f_vinserti64x4_mask:
9780 case CODE_FOR_avx512f_vextractf64x4_mask:
9781 case CODE_FOR_avx512f_vextracti64x4_mask:
9782 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9783 case CODE_FOR_avx512dq_vinserti32x8_mask:
9784 case CODE_FOR_avx512vl_vinsertv4df:
9785 case CODE_FOR_avx512vl_vinsertv4di:
9786 case CODE_FOR_avx512vl_vinsertv8sf:
9787 case CODE_FOR_avx512vl_vinsertv8si:
9788 error ("the last argument must be a 1-bit immediate");
9789 return const0_rtx;
9790
9791 case CODE_FOR_avx_vmcmpv2df3:
9792 case CODE_FOR_avx_vmcmpv4sf3:
9793 case CODE_FOR_avx_cmpv2df3:
9794 case CODE_FOR_avx_cmpv4sf3:
9795 case CODE_FOR_avx_cmpv4df3:
9796 case CODE_FOR_avx_cmpv8sf3:
9797 case CODE_FOR_avx512f_cmpv8df3_mask:
9798 case CODE_FOR_avx512f_cmpv16sf3_mask:
9799 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9800 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9801 error ("the last argument must be a 5-bit immediate");
9802 return const0_rtx;
9803
9804 default:
9805 switch (nargs_constant)
9806 {
9807 case 2:
9808 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9809 || (!mask_pos && (nargs - i) == nargs_constant))
9810 {
9811 error ("the next to last argument must be an 8-bit immediate");
9812 break;
9813 }
9814 /* FALLTHRU */
9815 case 1:
9816 error ("the last argument must be an 8-bit immediate");
9817 break;
9818 default:
9819 gcc_unreachable ();
9820 }
9821 return const0_rtx;
9822 }
9823 }
9824 else
9825 {
9826 if (VECTOR_MODE_P (mode))
9827 op = safe_vector_operand (op, mode);
9828
9829 /* If we aren't optimizing, only allow one memory operand to
9830 be generated. */
9831 if (memory_operand (op, mode))
9832 num_memory++;
9833
9834 op = fixup_modeless_constant (op, mode);
9835
9836 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9837 {
9838 if (optimize || !match || num_memory > 1)
9839 op = copy_to_mode_reg (mode, op);
9840 }
9841 else
9842 {
9843 op = copy_to_reg (op);
9844 op = lowpart_subreg (mode, op, GET_MODE (op));
9845 }
9846 }
9847
9848 args[i].op = op;
9849 args[i].mode = mode;
9850 }
9851
9852 switch (nargs)
9853 {
9854 case 1:
9855 pat = GEN_FCN (icode) (real_target, args[0].op);
9856 break;
9857 case 2:
9858 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9859 break;
9860 case 3:
9861 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9862 args[2].op);
9863 break;
9864 case 4:
9865 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9866 args[2].op, args[3].op);
9867 break;
9868 case 5:
9869 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9870 args[2].op, args[3].op, args[4].op);
9871 break;
9872 case 6:
9873 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9874 args[2].op, args[3].op, args[4].op,
9875 args[5].op);
9876 break;
9877 default:
9878 gcc_unreachable ();
9879 }
9880
9881 if (! pat)
9882 return 0;
9883
9884 emit_insn (pat);
9885 return target;
9886 }
9887
9888 /* Transform a pattern of the following layout:
9889 (set A
9890 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9891 into:
9892 (set A B)
9893 i.e. drop the UNSPEC_EMBEDDED_ROUNDING wrapper (and the rounding operand C).  */
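/* Illustrative sketch only (the operands and modes below are invented
   for the example, not taken from the md patterns): a generated insn such as
     (set (reg:CCFP FLAGS_REG)
	  (unspec [(compare:CCFP (reg:SF x) (reg:SF y))
		   (const_int C)]
	   UNSPEC_EMBEDDED_ROUNDING))
   where C is the rounding/SAE operand, is rewritten to
     (set (reg:CCFP FLAGS_REG)
	  (compare:CCFP (reg:SF x) (reg:SF y)))
   so that the ordinary, non-rounding insn pattern can match it.  */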
9894
9895 static rtx
9896 ix86_erase_embedded_rounding (rtx pat)
9897 {
9898 if (GET_CODE (pat) == INSN)
9899 pat = PATTERN (pat);
9900
9901 gcc_assert (GET_CODE (pat) == SET);
9902 rtx src = SET_SRC (pat);
9903 gcc_assert (XVECLEN (src, 0) == 2);
9904 rtx p0 = XVECEXP (src, 0, 0);
9905 gcc_assert (GET_CODE (src) == UNSPEC
9906 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9907 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9908 return res;
9909 }
9910
9911 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9912 with rounding. */
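/* (Background, based on the AVX-512 intrinsic headers rather than on
   anything in this file: the builtins routed here back intrinsics such
   as _mm_comi_round_ss / _mm_comi_round_sd, which take a _CMP_*
   predicate and an SAE control in addition to the two vector operands.)  */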
9913 static rtx
9914 ix86_expand_sse_comi_round (const struct builtin_description *d,
9915 tree exp, rtx target)
9916 {
9917 rtx pat, set_dst;
9918 tree arg0 = CALL_EXPR_ARG (exp, 0);
9919 tree arg1 = CALL_EXPR_ARG (exp, 1);
9920 tree arg2 = CALL_EXPR_ARG (exp, 2);
9921 tree arg3 = CALL_EXPR_ARG (exp, 3);
9922 rtx op0 = expand_normal (arg0);
9923 rtx op1 = expand_normal (arg1);
9924 rtx op2 = expand_normal (arg2);
9925 rtx op3 = expand_normal (arg3);
9926 enum insn_code icode = d->icode;
9927 const struct insn_data_d *insn_p = &insn_data[icode];
9928 machine_mode mode0 = insn_p->operand[0].mode;
9929 machine_mode mode1 = insn_p->operand[1].mode;
9930
9931 /* See avxintrin.h for values. */
9932 static const enum rtx_code comparisons[32] =
9933 {
9934 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9935 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9936 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9937 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
9938 };
9939 static const bool ordereds[32] =
9940 {
9941 true, true, true, false, false, false, false, true,
9942 false, false, false, true, true, true, true, false,
9943 true, true, true, false, false, false, false, true,
9944 false, false, false, true, true, true, true, false
9945 };
9946 static const bool non_signalings[32] =
9947 {
9948 true, false, false, true, true, false, false, true,
9949 true, false, false, true, true, false, false, true,
9950 false, true, true, false, false, true, true, false,
9951 false, true, true, false, false, true, true, false
9952 };
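/* For example (predicate values per avxintrin.h): predicate 0
   (_CMP_EQ_OQ) maps to EQ / ordered / non-signaling, while predicate 1
   (_CMP_LT_OS) maps to LT / ordered / signaling.  */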
9953
9954 if (!CONST_INT_P (op2))
9955 {
9956 error ("the third argument must be comparison constant");
9957 return const0_rtx;
9958 }
9959 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
9960 {
9961 error ("incorrect comparison mode");
9962 return const0_rtx;
9963 }
9964
9965 if (!insn_p->operand[2].predicate (op3, SImode))
9966 {
9967 error ("incorrect rounding operand");
9968 return const0_rtx;
9969 }
9970
9971 if (VECTOR_MODE_P (mode0))
9972 op0 = safe_vector_operand (op0, mode0);
9973 if (VECTOR_MODE_P (mode1))
9974 op1 = safe_vector_operand (op1, mode1);
9975
9976 enum rtx_code comparison = comparisons[INTVAL (op2)];
9977 bool ordered = ordereds[INTVAL (op2)];
9978 bool non_signaling = non_signalings[INTVAL (op2)];
9979 rtx const_val = const0_rtx;
9980
9981 bool check_unordered = false;
9982 machine_mode mode = CCFPmode;
9983 switch (comparison)
9984 {
9985 case ORDERED:
9986 if (!ordered)
9987 {
9988 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
9989 if (!non_signaling)
9990 ordered = true;
9991 mode = CCSmode;
9992 }
9993 else
9994 {
9995 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
9996 if (non_signaling)
9997 ordered = false;
9998 mode = CCPmode;
9999 }
10000 comparison = NE;
10001 break;
10002 case UNORDERED:
10003 if (ordered)
10004 {
10005 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10006 if (non_signaling)
10007 ordered = false;
10008 mode = CCSmode;
10009 }
10010 else
10011 {
10012 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10013 if (!non_signaling)
10014 ordered = true;
10015 mode = CCPmode;
10016 }
10017 comparison = EQ;
10018 break;
10019
10020 case LE: /* -> GE */
10021 case LT: /* -> GT */
10022 case UNGE: /* -> UNLE */
10023 case UNGT: /* -> UNLT */
10024 std::swap (op0, op1);
10025 comparison = swap_condition (comparison);
10026 /* FALLTHRU */
10027 case GT:
10028 case GE:
10029 case UNEQ:
10030 case UNLT:
10031 case UNLE:
10032 case LTGT:
10033 /* These are supported by CCFPmode. NB: Use ordered/signaling
10034 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10035 with NAN operands. */
10036 if (ordered == non_signaling)
10037 ordered = !ordered;
10038 break;
10039 case EQ:
10040 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10041 _CMP_EQ_OQ/_CMP_EQ_OS. */
10042 check_unordered = true;
10043 mode = CCZmode;
10044 break;
10045 case NE:
10046 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10047 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10048 gcc_assert (!ordered);
10049 check_unordered = true;
10050 mode = CCZmode;
10051 const_val = const1_rtx;
10052 break;
10053 default:
10054 gcc_unreachable ();
10055 }
10056
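/* Materialize the result in an SImode pseudo preset to const_val; only
   the low QImode part is overwritten below via STRICT_LOW_PART, so when
   the unordered branch added for check_unordered is taken, the preset
   value (0 or 1) is what remains.  */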
10057 target = gen_reg_rtx (SImode);
10058 emit_move_insn (target, const_val);
10059 target = gen_rtx_SUBREG (QImode, target, 0);
10060
10061 if ((optimize && !register_operand (op0, mode0))
10062 || !insn_p->operand[0].predicate (op0, mode0))
10063 op0 = copy_to_mode_reg (mode0, op0);
10064 if ((optimize && !register_operand (op1, mode1))
10065 || !insn_p->operand[1].predicate (op1, mode1))
10066 op1 = copy_to_mode_reg (mode1, op1);
10067
10068 /*
10069 1. COMI: ordered and signaling.
10070 2. UCOMI: unordered and non-signaling.
10071 */
10072 if (non_signaling)
10073 icode = (icode == CODE_FOR_sse_comi_round
10074 ? CODE_FOR_sse_ucomi_round
10075 : CODE_FOR_sse2_ucomi_round);
10076
10077 pat = GEN_FCN (icode) (op0, op1, op3);
10078 if (! pat)
10079 return 0;
10080
10081 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10082 if (INTVAL (op3) == NO_ROUND)
10083 {
10084 pat = ix86_erase_embedded_rounding (pat);
10085 if (! pat)
10086 return 0;
10087
10088 set_dst = SET_DEST (pat);
10089 }
10090 else
10091 {
10092 gcc_assert (GET_CODE (pat) == SET);
10093 set_dst = SET_DEST (pat);
10094 }
10095
10096 emit_insn (pat);
10097
10098 rtx_code_label *label = NULL;
10099
10100 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10101 sufficient with NAN operands (COMI/UCOMI set ZF for them as well).  */
10102 if (check_unordered)
10103 {
10104 gcc_assert (comparison == EQ || comparison == NE);
10105
10106 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10107 label = gen_label_rtx ();
10108 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10109 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10110 gen_rtx_LABEL_REF (VOIDmode, label),
10111 pc_rtx);
10112 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10113 }
10114
10115 /* NB: Set CCFPmode and check a different CCmode which is a subset
10116 of CCFPmode.  */
10117 if (GET_MODE (set_dst) != mode)
10118 {
10119 gcc_assert (mode == CCAmode || mode == CCCmode
10120 || mode == CCOmode || mode == CCPmode
10121 || mode == CCSmode || mode == CCZmode);
10122 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10123 }
10124
10125 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10126 gen_rtx_fmt_ee (comparison, QImode,
10127 set_dst,
10128 const0_rtx)));
10129
10130 if (label)
10131 emit_label (label);
10132
10133 return SUBREG_REG (target);
10134 }
10135
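/* Subroutine of ix86_expand_builtin to take care of insns that carry an
   explicit rounding / SAE operand as their last argument; when that
   operand is NO_ROUND, the embedded-rounding wrapper is erased again
   via ix86_erase_embedded_rounding.  */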
10136 static rtx
10137 ix86_expand_round_builtin (const struct builtin_description *d,
10138 tree exp, rtx target)
10139 {
10140 rtx pat;
10141 unsigned int i, nargs;
10142 struct
10143 {
10144 rtx op;
10145 machine_mode mode;
10146 } args[6];
10147 enum insn_code icode = d->icode;
10148 const struct insn_data_d *insn_p = &insn_data[icode];
10149 machine_mode tmode = insn_p->operand[0].mode;
10150 unsigned int nargs_constant = 0;
10151 unsigned int redundant_embed_rnd = 0;
10152
10153 switch ((enum ix86_builtin_func_type) d->flag)
10154 {
10155 case UINT64_FTYPE_V2DF_INT:
10156 case UINT64_FTYPE_V4SF_INT:
10157 case UINT_FTYPE_V2DF_INT:
10158 case UINT_FTYPE_V4SF_INT:
10159 case INT64_FTYPE_V2DF_INT:
10160 case INT64_FTYPE_V4SF_INT:
10161 case INT_FTYPE_V2DF_INT:
10162 case INT_FTYPE_V4SF_INT:
10163 nargs = 2;
10164 break;
10165 case V4SF_FTYPE_V4SF_UINT_INT:
10166 case V4SF_FTYPE_V4SF_UINT64_INT:
10167 case V2DF_FTYPE_V2DF_UINT64_INT:
10168 case V4SF_FTYPE_V4SF_INT_INT:
10169 case V4SF_FTYPE_V4SF_INT64_INT:
10170 case V2DF_FTYPE_V2DF_INT64_INT:
10171 case V4SF_FTYPE_V4SF_V4SF_INT:
10172 case V2DF_FTYPE_V2DF_V2DF_INT:
10173 case V4SF_FTYPE_V4SF_V2DF_INT:
10174 case V2DF_FTYPE_V2DF_V4SF_INT:
10175 nargs = 3;
10176 break;
10177 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10178 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10179 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10180 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10181 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10182 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10183 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10184 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10185 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10186 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10187 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10188 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10189 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10190 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10191 nargs = 4;
10192 break;
10193 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10194 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10195 nargs_constant = 2;
10196 nargs = 4;
10197 break;
10198 case INT_FTYPE_V4SF_V4SF_INT_INT:
10199 case INT_FTYPE_V2DF_V2DF_INT_INT:
10200 return ix86_expand_sse_comi_round (d, exp, target);
10201 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10202 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10203 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10204 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10205 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10206 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10207 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10208 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10209 nargs = 5;
10210 break;
10211 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10212 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10213 nargs_constant = 4;
10214 nargs = 5;
10215 break;
10216 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10217 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10218 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10219 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10220 nargs_constant = 3;
10221 nargs = 5;
10222 break;
10223 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10224 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10225 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10226 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10227 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10228 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10229 nargs = 6;
10230 nargs_constant = 4;
10231 break;
10232 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10233 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10234 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10235 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10236 nargs = 6;
10237 nargs_constant = 3;
10238 break;
10239 default:
10240 gcc_unreachable ();
10241 }
10242 gcc_assert (nargs <= ARRAY_SIZE (args));
10243
10244 if (optimize
10245 || target == 0
10246 || GET_MODE (target) != tmode
10247 || !insn_p->operand[0].predicate (target, tmode))
10248 target = gen_reg_rtx (tmode);
10249
10250 for (i = 0; i < nargs; i++)
10251 {
10252 tree arg = CALL_EXPR_ARG (exp, i);
10253 rtx op = expand_normal (arg);
10254 machine_mode mode = insn_p->operand[i + 1].mode;
10255 bool match = insn_p->operand[i + 1].predicate (op, mode);
10256
10257 if (i == nargs - nargs_constant)
10258 {
10259 if (!match)
10260 {
10261 switch (icode)
10262 {
10263 case CODE_FOR_avx512f_getmantv8df_mask_round:
10264 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10265 case CODE_FOR_avx512f_vgetmantv2df_round:
10266 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10267 case CODE_FOR_avx512f_vgetmantv4sf_round:
10268 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10269 error ("the immediate argument must be a 4-bit immediate");
10270 return const0_rtx;
10271 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10272 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10273 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10274 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10275 error ("the immediate argument must be a 5-bit immediate");
10276 return const0_rtx;
10277 default:
10278 error ("the immediate argument must be an 8-bit immediate");
10279 return const0_rtx;
10280 }
10281 }
10282 }
10283 else if (i == nargs - 1)
10284 {
10285 if (!insn_p->operand[nargs].predicate (op, SImode))
10286 {
10287 error ("incorrect rounding operand");
10288 return const0_rtx;
10289 }
10290
10291 /* If there is no rounding, use the normal version of the pattern. */
10292 if (INTVAL (op) == NO_ROUND)
10293 redundant_embed_rnd = 1;
10294 }
10295 else
10296 {
10297 if (VECTOR_MODE_P (mode))
10298 op = safe_vector_operand (op, mode);
10299
10300 op = fixup_modeless_constant (op, mode);
10301
10302 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10303 {
10304 if (optimize || !match)
10305 op = copy_to_mode_reg (mode, op);
10306 }
10307 else
10308 {
10309 op = copy_to_reg (op);
10310 op = lowpart_subreg (mode, op, GET_MODE (op));
10311 }
10312 }
10313
10314 args[i].op = op;
10315 args[i].mode = mode;
10316 }
10317
10318 switch (nargs)
10319 {
10320 case 1:
10321 pat = GEN_FCN (icode) (target, args[0].op);
10322 break;
10323 case 2:
10324 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10325 break;
10326 case 3:
10327 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10328 args[2].op);
10329 break;
10330 case 4:
10331 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10332 args[2].op, args[3].op);
10333 break;
10334 case 5:
10335 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10336 args[2].op, args[3].op, args[4].op);
10337 break;
10338 case 6:
10339 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10340 args[2].op, args[3].op, args[4].op,
10341 args[5].op);
10342 break;
10343 default:
10344 gcc_unreachable ();
10345 }
10346
10347 if (!pat)
10348 return 0;
10349
10350 if (redundant_embed_rnd)
10351 pat = ix86_erase_embedded_rounding (pat);
10352
10353 emit_insn (pat);
10354 return target;
10355 }
10356
10357 /* Subroutine of ix86_expand_builtin to take care of special insns
10358 with variable number of operands. */
10359
10360 static rtx
10361 ix86_expand_special_args_builtin (const struct builtin_description *d,
10362 tree exp, rtx target)
10363 {
10364 tree arg;
10365 rtx pat, op;
10366 unsigned int i, nargs, arg_adjust, memory;
10367 bool aligned_mem = false;
10368 struct
10369 {
10370 rtx op;
10371 machine_mode mode;
10372 } args[3];
10373 enum insn_code icode = d->icode;
10374 bool last_arg_constant = false;
10375 const struct insn_data_d *insn_p = &insn_data[icode];
10376 machine_mode tmode = insn_p->operand[0].mode;
10377 enum { load, store } klass;
10378
10379 switch ((enum ix86_builtin_func_type) d->flag)
10380 {
10381 case VOID_FTYPE_VOID:
10382 emit_insn (GEN_FCN (icode) (target));
10383 return 0;
10384 case VOID_FTYPE_UINT64:
10385 case VOID_FTYPE_UNSIGNED:
10386 nargs = 0;
10387 klass = store;
10388 memory = 0;
10389 break;
10390
10391 case INT_FTYPE_VOID:
10392 case USHORT_FTYPE_VOID:
10393 case UINT64_FTYPE_VOID:
10394 case UINT_FTYPE_VOID:
10395 case UNSIGNED_FTYPE_VOID:
10396 nargs = 0;
10397 klass = load;
10398 memory = 0;
10399 break;
10400 case UINT64_FTYPE_PUNSIGNED:
10401 case V2DI_FTYPE_PV2DI:
10402 case V4DI_FTYPE_PV4DI:
10403 case V32QI_FTYPE_PCCHAR:
10404 case V16QI_FTYPE_PCCHAR:
10405 case V8SF_FTYPE_PCV4SF:
10406 case V8SF_FTYPE_PCFLOAT:
10407 case V4SF_FTYPE_PCFLOAT:
10408 case V4DF_FTYPE_PCV2DF:
10409 case V4DF_FTYPE_PCDOUBLE:
10410 case V2DF_FTYPE_PCDOUBLE:
10411 case VOID_FTYPE_PVOID:
10412 case V8DI_FTYPE_PV8DI:
10413 nargs = 1;
10414 klass = load;
10415 memory = 0;
10416 switch (icode)
10417 {
10418 case CODE_FOR_sse4_1_movntdqa:
10419 case CODE_FOR_avx2_movntdqa:
10420 case CODE_FOR_avx512f_movntdqa:
10421 aligned_mem = true;
10422 break;
10423 default:
10424 break;
10425 }
10426 break;
10427 case VOID_FTYPE_PV2SF_V4SF:
10428 case VOID_FTYPE_PV8DI_V8DI:
10429 case VOID_FTYPE_PV4DI_V4DI:
10430 case VOID_FTYPE_PV2DI_V2DI:
10431 case VOID_FTYPE_PCHAR_V32QI:
10432 case VOID_FTYPE_PCHAR_V16QI:
10433 case VOID_FTYPE_PFLOAT_V16SF:
10434 case VOID_FTYPE_PFLOAT_V8SF:
10435 case VOID_FTYPE_PFLOAT_V4SF:
10436 case VOID_FTYPE_PDOUBLE_V8DF:
10437 case VOID_FTYPE_PDOUBLE_V4DF:
10438 case VOID_FTYPE_PDOUBLE_V2DF:
10439 case VOID_FTYPE_PLONGLONG_LONGLONG:
10440 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10441 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10442 case VOID_FTYPE_PINT_INT:
10443 nargs = 1;
10444 klass = store;
10445 /* Reserve memory operand for target. */
10446 memory = ARRAY_SIZE (args);
10447 switch (icode)
10448 {
10449 /* These builtins and instructions require the memory
10450 to be properly aligned. */
10451 case CODE_FOR_avx_movntv4di:
10452 case CODE_FOR_sse2_movntv2di:
10453 case CODE_FOR_avx_movntv8sf:
10454 case CODE_FOR_sse_movntv4sf:
10455 case CODE_FOR_sse4a_vmmovntv4sf:
10456 case CODE_FOR_avx_movntv4df:
10457 case CODE_FOR_sse2_movntv2df:
10458 case CODE_FOR_sse4a_vmmovntv2df:
10459 case CODE_FOR_sse2_movntidi:
10460 case CODE_FOR_sse_movntq:
10461 case CODE_FOR_sse2_movntisi:
10462 case CODE_FOR_avx512f_movntv16sf:
10463 case CODE_FOR_avx512f_movntv8df:
10464 case CODE_FOR_avx512f_movntv8di:
10465 aligned_mem = true;
10466 break;
10467 default:
10468 break;
10469 }
10470 break;
10471 case VOID_FTYPE_PVOID_PCVOID:
10472 nargs = 1;
10473 klass = store;
10474 memory = 0;
10475
10476 break;
10477 case V4SF_FTYPE_V4SF_PCV2SF:
10478 case V2DF_FTYPE_V2DF_PCDOUBLE:
10479 nargs = 2;
10480 klass = load;
10481 memory = 1;
10482 break;
10483 case V8SF_FTYPE_PCV8SF_V8SI:
10484 case V4DF_FTYPE_PCV4DF_V4DI:
10485 case V4SF_FTYPE_PCV4SF_V4SI:
10486 case V2DF_FTYPE_PCV2DF_V2DI:
10487 case V8SI_FTYPE_PCV8SI_V8SI:
10488 case V4DI_FTYPE_PCV4DI_V4DI:
10489 case V4SI_FTYPE_PCV4SI_V4SI:
10490 case V2DI_FTYPE_PCV2DI_V2DI:
10491 case VOID_FTYPE_INT_INT64:
10492 nargs = 2;
10493 klass = load;
10494 memory = 0;
10495 break;
10496 case VOID_FTYPE_PV8DF_V8DF_UQI:
10497 case VOID_FTYPE_PV4DF_V4DF_UQI:
10498 case VOID_FTYPE_PV2DF_V2DF_UQI:
10499 case VOID_FTYPE_PV16SF_V16SF_UHI:
10500 case VOID_FTYPE_PV8SF_V8SF_UQI:
10501 case VOID_FTYPE_PV4SF_V4SF_UQI:
10502 case VOID_FTYPE_PV8DI_V8DI_UQI:
10503 case VOID_FTYPE_PV4DI_V4DI_UQI:
10504 case VOID_FTYPE_PV2DI_V2DI_UQI:
10505 case VOID_FTYPE_PV16SI_V16SI_UHI:
10506 case VOID_FTYPE_PV8SI_V8SI_UQI:
10507 case VOID_FTYPE_PV4SI_V4SI_UQI:
10508 case VOID_FTYPE_PV64QI_V64QI_UDI:
10509 case VOID_FTYPE_PV32HI_V32HI_USI:
10510 case VOID_FTYPE_PV32QI_V32QI_USI:
10511 case VOID_FTYPE_PV16QI_V16QI_UHI:
10512 case VOID_FTYPE_PV16HI_V16HI_UHI:
10513 case VOID_FTYPE_PV8HI_V8HI_UQI:
10514 switch (icode)
10515 {
10516 /* These builtins and instructions require the memory
10517 to be properly aligned. */
10518 case CODE_FOR_avx512f_storev16sf_mask:
10519 case CODE_FOR_avx512f_storev16si_mask:
10520 case CODE_FOR_avx512f_storev8df_mask:
10521 case CODE_FOR_avx512f_storev8di_mask:
10522 case CODE_FOR_avx512vl_storev8sf_mask:
10523 case CODE_FOR_avx512vl_storev8si_mask:
10524 case CODE_FOR_avx512vl_storev4df_mask:
10525 case CODE_FOR_avx512vl_storev4di_mask:
10526 case CODE_FOR_avx512vl_storev4sf_mask:
10527 case CODE_FOR_avx512vl_storev4si_mask:
10528 case CODE_FOR_avx512vl_storev2df_mask:
10529 case CODE_FOR_avx512vl_storev2di_mask:
10530 aligned_mem = true;
10531 break;
10532 default:
10533 break;
10534 }
10535 /* FALLTHRU */
10536 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10537 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10538 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10539 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10540 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10541 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10542 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10543 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10544 case VOID_FTYPE_PV8SI_V8DI_UQI:
10545 case VOID_FTYPE_PV8HI_V8DI_UQI:
10546 case VOID_FTYPE_PV16HI_V16SI_UHI:
10547 case VOID_FTYPE_PUDI_V8DI_UQI:
10548 case VOID_FTYPE_PV16QI_V16SI_UHI:
10549 case VOID_FTYPE_PV4SI_V4DI_UQI:
10550 case VOID_FTYPE_PUDI_V2DI_UQI:
10551 case VOID_FTYPE_PUDI_V4DI_UQI:
10552 case VOID_FTYPE_PUSI_V2DI_UQI:
10553 case VOID_FTYPE_PV8HI_V8SI_UQI:
10554 case VOID_FTYPE_PUDI_V4SI_UQI:
10555 case VOID_FTYPE_PUSI_V4DI_UQI:
10556 case VOID_FTYPE_PUHI_V2DI_UQI:
10557 case VOID_FTYPE_PUDI_V8SI_UQI:
10558 case VOID_FTYPE_PUSI_V4SI_UQI:
10559 case VOID_FTYPE_PCHAR_V64QI_UDI:
10560 case VOID_FTYPE_PCHAR_V32QI_USI:
10561 case VOID_FTYPE_PCHAR_V16QI_UHI:
10562 case VOID_FTYPE_PSHORT_V32HI_USI:
10563 case VOID_FTYPE_PSHORT_V16HI_UHI:
10564 case VOID_FTYPE_PSHORT_V8HI_UQI:
10565 case VOID_FTYPE_PINT_V16SI_UHI:
10566 case VOID_FTYPE_PINT_V8SI_UQI:
10567 case VOID_FTYPE_PINT_V4SI_UQI:
10568 case VOID_FTYPE_PINT64_V8DI_UQI:
10569 case VOID_FTYPE_PINT64_V4DI_UQI:
10570 case VOID_FTYPE_PINT64_V2DI_UQI:
10571 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10572 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10573 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10574 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10575 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10576 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10577 case VOID_FTYPE_PV32QI_V32HI_USI:
10578 case VOID_FTYPE_PV16QI_V16HI_UHI:
10579 case VOID_FTYPE_PUDI_V8HI_UQI:
10580 nargs = 2;
10581 klass = store;
10582 /* Reserve memory operand for target. */
10583 memory = ARRAY_SIZE (args);
10584 break;
10585 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10586 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10587 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10588 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10589 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10590 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10591 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10592 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10593 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10594 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10595 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10596 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10597 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10598 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10599 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10600 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10601 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10602 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10603 switch (icode)
10604 {
10605 /* These builtins and instructions require the memory
10606 to be properly aligned. */
10607 case CODE_FOR_avx512f_loadv16sf_mask:
10608 case CODE_FOR_avx512f_loadv16si_mask:
10609 case CODE_FOR_avx512f_loadv8df_mask:
10610 case CODE_FOR_avx512f_loadv8di_mask:
10611 case CODE_FOR_avx512vl_loadv8sf_mask:
10612 case CODE_FOR_avx512vl_loadv8si_mask:
10613 case CODE_FOR_avx512vl_loadv4df_mask:
10614 case CODE_FOR_avx512vl_loadv4di_mask:
10615 case CODE_FOR_avx512vl_loadv4sf_mask:
10616 case CODE_FOR_avx512vl_loadv4si_mask:
10617 case CODE_FOR_avx512vl_loadv2df_mask:
10618 case CODE_FOR_avx512vl_loadv2di_mask:
10619 case CODE_FOR_avx512bw_loadv64qi_mask:
10620 case CODE_FOR_avx512vl_loadv32qi_mask:
10621 case CODE_FOR_avx512vl_loadv16qi_mask:
10622 case CODE_FOR_avx512bw_loadv32hi_mask:
10623 case CODE_FOR_avx512vl_loadv16hi_mask:
10624 case CODE_FOR_avx512vl_loadv8hi_mask:
10625 aligned_mem = true;
10626 break;
10627 default:
10628 break;
10629 }
10630 /* FALLTHRU */
10631 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10632 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10633 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10634 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10635 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10636 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10637 case V16SI_FTYPE_PCINT_V16SI_UHI:
10638 case V8SI_FTYPE_PCINT_V8SI_UQI:
10639 case V4SI_FTYPE_PCINT_V4SI_UQI:
10640 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10641 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10642 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10643 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10644 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10645 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10646 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10647 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10648 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10649 nargs = 3;
10650 klass = load;
10651 memory = 0;
10652 break;
10653 case VOID_FTYPE_UINT_UINT_UINT:
10654 case VOID_FTYPE_UINT64_UINT_UINT:
10655 case UCHAR_FTYPE_UINT_UINT_UINT:
10656 case UCHAR_FTYPE_UINT64_UINT_UINT:
10657 nargs = 3;
10658 klass = load;
10659 memory = ARRAY_SIZE (args);
10660 last_arg_constant = true;
10661 break;
10662 default:
10663 gcc_unreachable ();
10664 }
10665
10666 gcc_assert (nargs <= ARRAY_SIZE (args));
10667
10668 if (klass == store)
10669 {
10670 arg = CALL_EXPR_ARG (exp, 0);
10671 op = expand_normal (arg);
10672 gcc_assert (target == 0);
10673 if (memory)
10674 {
10675 op = ix86_zero_extend_to_Pmode (op);
10676 target = gen_rtx_MEM (tmode, op);
10677 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10678 on it. Try to improve it using get_pointer_alignment,
10679 and if the special builtin is one that requires strict
10680 mode alignment, also from its GET_MODE_ALIGNMENT.
10681 Failure to do so could lead to ix86_legitimate_combined_insn
10682 rejecting all changes to such insns. */
10683 unsigned int align = get_pointer_alignment (arg);
10684 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10685 align = GET_MODE_ALIGNMENT (tmode);
10686 if (MEM_ALIGN (target) < align)
10687 set_mem_align (target, align);
10688 }
10689 else
10690 target = force_reg (tmode, op);
10691 arg_adjust = 1;
10692 }
10693 else
10694 {
10695 arg_adjust = 0;
10696 if (optimize
10697 || target == 0
10698 || !register_operand (target, tmode)
10699 || GET_MODE (target) != tmode)
10700 target = gen_reg_rtx (tmode);
10701 }
10702
10703 for (i = 0; i < nargs; i++)
10704 {
10705 machine_mode mode = insn_p->operand[i + 1].mode;
10706 bool match;
10707
10708 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10709 op = expand_normal (arg);
10710 match = insn_p->operand[i + 1].predicate (op, mode);
10711
10712 if (last_arg_constant && (i + 1) == nargs)
10713 {
10714 if (!match)
10715 {
10716 if (icode == CODE_FOR_lwp_lwpvalsi3
10717 || icode == CODE_FOR_lwp_lwpinssi3
10718 || icode == CODE_FOR_lwp_lwpvaldi3
10719 || icode == CODE_FOR_lwp_lwpinsdi3)
10720 error ("the last argument must be a 32-bit immediate");
10721 else
10722 error ("the last argument must be an 8-bit immediate");
10723 return const0_rtx;
10724 }
10725 }
10726 else
10727 {
10728 if (i == memory)
10729 {
10730 /* This must be the memory operand. */
10731 op = ix86_zero_extend_to_Pmode (op);
10732 op = gen_rtx_MEM (mode, op);
10733 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10734 on it. Try to improve it using get_pointer_alignment,
10735 and if the special builtin is one that requires strict
10736 mode alignment, also from its GET_MODE_ALIGNMENT.
10737 Failure to do so could lead to ix86_legitimate_combined_insn
10738 rejecting all changes to such insns. */
10739 unsigned int align = get_pointer_alignment (arg);
10740 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10741 align = GET_MODE_ALIGNMENT (mode);
10742 if (MEM_ALIGN (op) < align)
10743 set_mem_align (op, align);
10744 }
10745 else
10746 {
10747 /* This must be a register. */
10748 if (VECTOR_MODE_P (mode))
10749 op = safe_vector_operand (op, mode);
10750
10751 op = fixup_modeless_constant (op, mode);
10752
10753 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10754 op = copy_to_mode_reg (mode, op);
10755 else
10756 {
10757 op = copy_to_reg (op);
10758 op = lowpart_subreg (mode, op, GET_MODE (op));
10759 }
10760 }
10761 }
10762
10763 args[i].op = op;
10764 args[i].mode = mode;
10765 }
10766
10767 switch (nargs)
10768 {
10769 case 0:
10770 pat = GEN_FCN (icode) (target);
10771 break;
10772 case 1:
10773 pat = GEN_FCN (icode) (target, args[0].op);
10774 break;
10775 case 2:
10776 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10777 break;
10778 case 3:
10779 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10780 break;
10781 default:
10782 gcc_unreachable ();
10783 }
10784
10785 if (! pat)
10786 return 0;
10787 emit_insn (pat);
10788 return klass == store ? 0 : target;
10789 }
10790
10791 /* Return the integer constant in ARG. Constrain it to be in the range
10792 of the subparts of VEC_TYPE; issue an error if not. */
10793
10794 static int
10795 get_element_number (tree vec_type, tree arg)
10796 {
10797 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10798
10799 if (!tree_fits_uhwi_p (arg)
10800 || (elt = tree_to_uhwi (arg), elt > max))
10801 {
10802 error ("selector must be an integer constant in the range "
10803 "[0, %wi]", max);
10804 return 0;
10805 }
10806
10807 return elt;
10808 }
10809
10810 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10811 ix86_expand_vector_init. We DO have language-level syntax for this, in
10812 the form of (type){ init-list }. Except that since we can't place emms
10813 instructions from inside the compiler, we can't allow the use of MMX
10814 registers unless the user explicitly asks for it. So we do *not* define
10815 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10816 we have builtins invoked by mmintrin.h that give us license to emit
10817 these sorts of instructions. */
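/* (For instance, per mmintrin.h rather than anything in this file,
   _mm_set_pi32 is implemented on top of __builtin_ia32_vec_init_v2si,
   which is expanded here via IX86_BUILTIN_VEC_INIT_V2SI below.)  */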
10818
10819 static rtx
10820 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10821 {
10822 machine_mode tmode = TYPE_MODE (type);
10823 machine_mode inner_mode = GET_MODE_INNER (tmode);
10824 int i, n_elt = GET_MODE_NUNITS (tmode);
10825 rtvec v = rtvec_alloc (n_elt);
10826
10827 gcc_assert (VECTOR_MODE_P (tmode));
10828 gcc_assert (call_expr_nargs (exp) == n_elt);
10829
10830 for (i = 0; i < n_elt; ++i)
10831 {
10832 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10833 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10834 }
10835
10836 if (!target || !register_operand (target, tmode))
10837 target = gen_reg_rtx (tmode);
10838
10839 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10840 return target;
10841 }
10842
10843 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10844 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10845 had a language-level syntax for referencing vector elements. */
10846
10847 static rtx
10848 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10849 {
10850 machine_mode tmode, mode0;
10851 tree arg0, arg1;
10852 int elt;
10853 rtx op0;
10854
10855 arg0 = CALL_EXPR_ARG (exp, 0);
10856 arg1 = CALL_EXPR_ARG (exp, 1);
10857
10858 op0 = expand_normal (arg0);
10859 elt = get_element_number (TREE_TYPE (arg0), arg1);
10860
10861 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10862 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10863 gcc_assert (VECTOR_MODE_P (mode0));
10864
10865 op0 = force_reg (mode0, op0);
10866
10867 if (optimize || !target || !register_operand (target, tmode))
10868 target = gen_reg_rtx (tmode);
10869
10870 ix86_expand_vector_extract (true, target, op0, elt);
10871
10872 return target;
10873 }
10874
10875 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10876 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10877 a language-level syntax for referencing vector elements. */
10878
10879 static rtx
10880 ix86_expand_vec_set_builtin (tree exp)
10881 {
10882 machine_mode tmode, mode1;
10883 tree arg0, arg1, arg2;
10884 int elt;
10885 rtx op0, op1, target;
10886
10887 arg0 = CALL_EXPR_ARG (exp, 0);
10888 arg1 = CALL_EXPR_ARG (exp, 1);
10889 arg2 = CALL_EXPR_ARG (exp, 2);
10890
10891 tmode = TYPE_MODE (TREE_TYPE (arg0));
10892 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10893 gcc_assert (VECTOR_MODE_P (tmode));
10894
10895 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10896 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10897 elt = get_element_number (TREE_TYPE (arg0), arg2);
10898
10899 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10900 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10901
10902 op0 = force_reg (tmode, op0);
10903 op1 = force_reg (mode1, op1);
10904
10905 /* OP0 is the source of these builtin functions and shouldn't be
10906 modified. Create a copy, use it and return it as target. */
10907 target = gen_reg_rtx (tmode);
10908 emit_move_insn (target, op0);
10909 ix86_expand_vector_set (true, target, op1, elt);
10910
10911 return target;
10912 }
10913
10914 /* Expand an expression EXP that calls a built-in function,
10915 with result going to TARGET if that's convenient
10916 (and in mode MODE if that's convenient).
10917 SUBTARGET may be used as the target for computing one of EXP's operands.
10918 IGNORE is nonzero if the value is to be ignored. */
10919
10920 rtx
10921 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10922 machine_mode mode, int ignore)
10923 {
10924 size_t i;
10925 enum insn_code icode, icode2;
10926 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10927 tree arg0, arg1, arg2, arg3, arg4;
10928 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10929 machine_mode mode0, mode1, mode2, mode3, mode4;
10930 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
10931
10932 /* For CPU builtins that can be folded, fold first and expand the fold. */
10933 switch (fcode)
10934 {
10935 case IX86_BUILTIN_CPU_INIT:
10936 {
10937 /* Make it call __cpu_indicator_init in libgcc. */
10938 tree call_expr, fndecl, type;
10939 type = build_function_type_list (integer_type_node, NULL_TREE);
10940 fndecl = build_fn_decl ("__cpu_indicator_init", type);
10941 call_expr = build_call_expr (fndecl, 0);
10942 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
10943 }
10944 case IX86_BUILTIN_CPU_IS:
10945 case IX86_BUILTIN_CPU_SUPPORTS:
10946 {
10947 tree arg0 = CALL_EXPR_ARG (exp, 0);
10948 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
10949 gcc_assert (fold_expr != NULL_TREE);
10950 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
10951 }
10952 }
10953
10954 HOST_WIDE_INT isa = ix86_isa_flags;
10955 HOST_WIDE_INT isa2 = ix86_isa_flags2;
10956 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
10957 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
10958 /* The general case is we require all the ISAs specified in bisa{,2}
10959 to be enabled.
10960 The exceptions are:
10961 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
10962 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
10963 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
10964 where for each such pair it is sufficient if either of the ISAs is
10965 enabled; any other options ORed in with such a pair must still be enabled.
10966 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
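/* For example (restating the rule above, not a new requirement): a
   builtin whose mask is OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 may
   be expanded when only -mfma or only -mfma4 is enabled.  */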
10967 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10968 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10969 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
10970 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
10971 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10972 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10973 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
10974 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
10975 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10976 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10977 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
10978 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
10979 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
10980 {
10981 bisa &= ~OPTION_MASK_ISA_MMX;
10982 bisa |= OPTION_MASK_ISA_SSE2;
10983 }
10984 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
10985 {
10986 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
10987 if (TARGET_ABI_X32)
10988 bisa |= OPTION_MASK_ABI_X32;
10989 else
10990 bisa |= OPTION_MASK_ABI_64;
10991 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
10992 (enum fpmath_unit) 0,
10993 (enum prefer_vector_width) 0,
10994 false, add_abi_p);
10995 if (!opts)
10996 error ("%qE needs unknown isa option", fndecl);
10997 else
10998 {
10999 gcc_assert (opts != NULL);
11000 error ("%qE needs isa option %s", fndecl, opts);
11001 free (opts);
11002 }
11003 return expand_call (exp, target, ignore);
11004 }
11005
11006 switch (fcode)
11007 {
11008 case IX86_BUILTIN_MASKMOVQ:
11009 case IX86_BUILTIN_MASKMOVDQU:
11010 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11011 ? CODE_FOR_mmx_maskmovq
11012 : CODE_FOR_sse2_maskmovdqu);
11013 /* Note the arg order is different from the operand order. */
11014 arg1 = CALL_EXPR_ARG (exp, 0);
11015 arg2 = CALL_EXPR_ARG (exp, 1);
11016 arg0 = CALL_EXPR_ARG (exp, 2);
11017 op0 = expand_normal (arg0);
11018 op1 = expand_normal (arg1);
11019 op2 = expand_normal (arg2);
11020 mode0 = insn_data[icode].operand[0].mode;
11021 mode1 = insn_data[icode].operand[1].mode;
11022 mode2 = insn_data[icode].operand[2].mode;
11023
11024 op0 = ix86_zero_extend_to_Pmode (op0);
11025 op0 = gen_rtx_MEM (mode1, op0);
11026
11027 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11028 op0 = copy_to_mode_reg (mode0, op0);
11029 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11030 op1 = copy_to_mode_reg (mode1, op1);
11031 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11032 op2 = copy_to_mode_reg (mode2, op2);
11033 pat = GEN_FCN (icode) (op0, op1, op2);
11034 if (! pat)
11035 return 0;
11036 emit_insn (pat);
11037 return 0;
11038
11039 case IX86_BUILTIN_LDMXCSR:
11040 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11041 target = assign_386_stack_local (SImode, SLOT_TEMP);
11042 emit_move_insn (target, op0);
11043 emit_insn (gen_sse_ldmxcsr (target));
11044 return 0;
11045
11046 case IX86_BUILTIN_STMXCSR:
11047 target = assign_386_stack_local (SImode, SLOT_TEMP);
11048 emit_insn (gen_sse_stmxcsr (target));
11049 return copy_to_mode_reg (SImode, target);
11050
11051 case IX86_BUILTIN_CLFLUSH:
11052 arg0 = CALL_EXPR_ARG (exp, 0);
11053 op0 = expand_normal (arg0);
11054 icode = CODE_FOR_sse2_clflush;
11055 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11056 op0 = ix86_zero_extend_to_Pmode (op0);
11057
11058 emit_insn (gen_sse2_clflush (op0));
11059 return 0;
11060
11061 case IX86_BUILTIN_CLWB:
11062 arg0 = CALL_EXPR_ARG (exp, 0);
11063 op0 = expand_normal (arg0);
11064 icode = CODE_FOR_clwb;
11065 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11066 op0 = ix86_zero_extend_to_Pmode (op0);
11067
11068 emit_insn (gen_clwb (op0));
11069 return 0;
11070
11071 case IX86_BUILTIN_CLFLUSHOPT:
11072 arg0 = CALL_EXPR_ARG (exp, 0);
11073 op0 = expand_normal (arg0);
11074 icode = CODE_FOR_clflushopt;
11075 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11076 op0 = ix86_zero_extend_to_Pmode (op0);
11077
11078 emit_insn (gen_clflushopt (op0));
11079 return 0;
11080
11081 case IX86_BUILTIN_MONITOR:
11082 case IX86_BUILTIN_MONITORX:
11083 arg0 = CALL_EXPR_ARG (exp, 0);
11084 arg1 = CALL_EXPR_ARG (exp, 1);
11085 arg2 = CALL_EXPR_ARG (exp, 2);
11086 op0 = expand_normal (arg0);
11087 op1 = expand_normal (arg1);
11088 op2 = expand_normal (arg2);
11089 if (!REG_P (op0))
11090 op0 = ix86_zero_extend_to_Pmode (op0);
11091 if (!REG_P (op1))
11092 op1 = copy_to_mode_reg (SImode, op1);
11093 if (!REG_P (op2))
11094 op2 = copy_to_mode_reg (SImode, op2);
11095
11096 emit_insn (fcode == IX86_BUILTIN_MONITOR
11097 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11098 : gen_monitorx (Pmode, op0, op1, op2));
11099 return 0;
11100
11101 case IX86_BUILTIN_MWAIT:
11102 arg0 = CALL_EXPR_ARG (exp, 0);
11103 arg1 = CALL_EXPR_ARG (exp, 1);
11104 op0 = expand_normal (arg0);
11105 op1 = expand_normal (arg1);
11106 if (!REG_P (op0))
11107 op0 = copy_to_mode_reg (SImode, op0);
11108 if (!REG_P (op1))
11109 op1 = copy_to_mode_reg (SImode, op1);
11110 emit_insn (gen_sse3_mwait (op0, op1));
11111 return 0;
11112
11113 case IX86_BUILTIN_MWAITX:
11114 arg0 = CALL_EXPR_ARG (exp, 0);
11115 arg1 = CALL_EXPR_ARG (exp, 1);
11116 arg2 = CALL_EXPR_ARG (exp, 2);
11117 op0 = expand_normal (arg0);
11118 op1 = expand_normal (arg1);
11119 op2 = expand_normal (arg2);
11120 if (!REG_P (op0))
11121 op0 = copy_to_mode_reg (SImode, op0);
11122 if (!REG_P (op1))
11123 op1 = copy_to_mode_reg (SImode, op1);
11124 if (!REG_P (op2))
11125 op2 = copy_to_mode_reg (SImode, op2);
11126 emit_insn (gen_mwaitx (op0, op1, op2));
11127 return 0;
11128
11129 case IX86_BUILTIN_UMONITOR:
11130 arg0 = CALL_EXPR_ARG (exp, 0);
11131 op0 = expand_normal (arg0);
11132
11133 op0 = ix86_zero_extend_to_Pmode (op0);
11134 emit_insn (gen_umonitor (Pmode, op0));
11135 return 0;
11136
11137 case IX86_BUILTIN_UMWAIT:
11138 case IX86_BUILTIN_TPAUSE:
11139 arg0 = CALL_EXPR_ARG (exp, 0);
11140 arg1 = CALL_EXPR_ARG (exp, 1);
11141 op0 = expand_normal (arg0);
11142 op1 = expand_normal (arg1);
11143
11144 if (!REG_P (op0))
11145 op0 = copy_to_mode_reg (SImode, op0);
11146
11147 op1 = force_reg (DImode, op1);
11148
11149 if (TARGET_64BIT)
11150 {
11151 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11152 NULL, 1, OPTAB_DIRECT);
11153 switch (fcode)
11154 {
11155 case IX86_BUILTIN_UMWAIT:
11156 icode = CODE_FOR_umwait_rex64;
11157 break;
11158 case IX86_BUILTIN_TPAUSE:
11159 icode = CODE_FOR_tpause_rex64;
11160 break;
11161 default:
11162 gcc_unreachable ();
11163 }
11164
11165 op2 = gen_lowpart (SImode, op2);
11166 op1 = gen_lowpart (SImode, op1);
11167 pat = GEN_FCN (icode) (op0, op1, op2);
11168 }
11169 else
11170 {
11171 switch (fcode)
11172 {
11173 case IX86_BUILTIN_UMWAIT:
11174 icode = CODE_FOR_umwait;
11175 break;
11176 case IX86_BUILTIN_TPAUSE:
11177 icode = CODE_FOR_tpause;
11178 break;
11179 default:
11180 gcc_unreachable ();
11181 }
11182 pat = GEN_FCN (icode) (op0, op1);
11183 }
11184
11185 if (!pat)
11186 return 0;
11187
11188 emit_insn (pat);
11189
11190 if (target == 0
11191 || !register_operand (target, QImode))
11192 target = gen_reg_rtx (QImode);
11193
11194 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11195 const0_rtx);
11196 emit_insn (gen_rtx_SET (target, pat));
11197
11198 return target;
11199
11200 case IX86_BUILTIN_CLZERO:
11201 arg0 = CALL_EXPR_ARG (exp, 0);
11202 op0 = expand_normal (arg0);
11203 if (!REG_P (op0))
11204 op0 = ix86_zero_extend_to_Pmode (op0);
11205 emit_insn (gen_clzero (Pmode, op0));
11206 return 0;
11207
11208 case IX86_BUILTIN_CLDEMOTE:
11209 arg0 = CALL_EXPR_ARG (exp, 0);
11210 op0 = expand_normal (arg0);
11211 icode = CODE_FOR_cldemote;
11212 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11213 op0 = ix86_zero_extend_to_Pmode (op0);
11214
11215 emit_insn (gen_cldemote (op0));
11216 return 0;
11217
11218 case IX86_BUILTIN_VEC_INIT_V2SI:
11219 case IX86_BUILTIN_VEC_INIT_V4HI:
11220 case IX86_BUILTIN_VEC_INIT_V8QI:
11221 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11222
11223 case IX86_BUILTIN_VEC_EXT_V2DF:
11224 case IX86_BUILTIN_VEC_EXT_V2DI:
11225 case IX86_BUILTIN_VEC_EXT_V4SF:
11226 case IX86_BUILTIN_VEC_EXT_V4SI:
11227 case IX86_BUILTIN_VEC_EXT_V8HI:
11228 case IX86_BUILTIN_VEC_EXT_V2SI:
11229 case IX86_BUILTIN_VEC_EXT_V4HI:
11230 case IX86_BUILTIN_VEC_EXT_V16QI:
11231 return ix86_expand_vec_ext_builtin (exp, target);
11232
11233 case IX86_BUILTIN_VEC_SET_V2DI:
11234 case IX86_BUILTIN_VEC_SET_V4SF:
11235 case IX86_BUILTIN_VEC_SET_V4SI:
11236 case IX86_BUILTIN_VEC_SET_V8HI:
11237 case IX86_BUILTIN_VEC_SET_V4HI:
11238 case IX86_BUILTIN_VEC_SET_V16QI:
11239 return ix86_expand_vec_set_builtin (exp);
11240
11241 case IX86_BUILTIN_NANQ:
11242 case IX86_BUILTIN_NANSQ:
11243 return expand_call (exp, target, ignore);
11244
11245 case IX86_BUILTIN_RDPID:
11246
11247 op0 = gen_reg_rtx (word_mode);
11248
11249 if (TARGET_64BIT)
11250 {
11251 insn = gen_rdpid_rex64 (op0);
11252 op0 = convert_to_mode (SImode, op0, 1);
11253 }
11254 else
11255 insn = gen_rdpid (op0);
11256
11257 emit_insn (insn);
11258
11259 if (target == 0
11260 || !register_operand (target, SImode))
11261 target = gen_reg_rtx (SImode);
11262
11263 emit_move_insn (target, op0);
11264 return target;
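	  /* Illustrative only: this builtin backs __builtin_ia32_rdpid (and
	     the _rdpid_u32 wrapper where it is provided), which returns the
	     32-bit IA32_TSC_AUX value read by RDPID; on 64-bit targets the
	     DImode result produced above is narrowed back to SImode first.

	       unsigned int id = __builtin_ia32_rdpid ();
	  */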
11265
11266 case IX86_BUILTIN_2INTERSECTD512:
11267 case IX86_BUILTIN_2INTERSECTQ512:
11268 case IX86_BUILTIN_2INTERSECTD256:
11269 case IX86_BUILTIN_2INTERSECTQ256:
11270 case IX86_BUILTIN_2INTERSECTD128:
11271 case IX86_BUILTIN_2INTERSECTQ128:
11272 arg0 = CALL_EXPR_ARG (exp, 0);
11273 arg1 = CALL_EXPR_ARG (exp, 1);
11274 arg2 = CALL_EXPR_ARG (exp, 2);
11275 arg3 = CALL_EXPR_ARG (exp, 3);
11276 op0 = expand_normal (arg0);
11277 op1 = expand_normal (arg1);
11278 op2 = expand_normal (arg2);
11279 op3 = expand_normal (arg3);
11280
11281 if (!address_operand (op0, VOIDmode))
11282 {
11283 op0 = convert_memory_address (Pmode, op0);
11284 op0 = copy_addr_to_reg (op0);
11285 }
11286 if (!address_operand (op1, VOIDmode))
11287 {
11288 op1 = convert_memory_address (Pmode, op1);
11289 op1 = copy_addr_to_reg (op1);
11290 }
11291
11292 switch (fcode)
11293 {
11294 case IX86_BUILTIN_2INTERSECTD512:
11295 mode4 = P2HImode;
11296 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11297 break;
11298 case IX86_BUILTIN_2INTERSECTQ512:
11299 mode4 = P2QImode;
11300 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11301 break;
11302 case IX86_BUILTIN_2INTERSECTD256:
11303 mode4 = P2QImode;
11304 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11305 break;
11306 case IX86_BUILTIN_2INTERSECTQ256:
11307 mode4 = P2QImode;
11308 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11309 break;
11310 case IX86_BUILTIN_2INTERSECTD128:
11311 mode4 = P2QImode;
11312 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11313 break;
11314 case IX86_BUILTIN_2INTERSECTQ128:
11315 mode4 = P2QImode;
11316 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11317 break;
11318 default:
11319 gcc_unreachable ();
11320 }
11321
11322 mode2 = insn_data[icode].operand[1].mode;
11323 mode3 = insn_data[icode].operand[2].mode;
11324 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11325 op2 = copy_to_mode_reg (mode2, op2);
11326 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11327 op3 = copy_to_mode_reg (mode3, op3);
11328
11329 op4 = gen_reg_rtx (mode4);
11330 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11331 mode0 = mode4 == P2HImode ? HImode : QImode;
11332 emit_move_insn (gen_rtx_MEM (mode0, op0),
11333 gen_lowpart (mode0, op4));
11334 emit_move_insn (gen_rtx_MEM (mode0, op1),
11335 gen_highpart (mode0, op4));
11336
11337 return 0;
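	  /* Illustrative only -- a sketch of the user-level shape, assuming
	     the usual AVX512-VP2INTERSECT wrapper with an
	     (__m512i, __m512i, __mmask16 *, __mmask16 *) signature.  The two
	     output masks are returned through pointers, which is why op0 and
	     op1 above are forced into address registers and the low/high
	     parts of the mask-pair register op4 are stored through them.

	       #include <immintrin.h>

	       void intersect16 (__m512i a, __m512i b,
	                         __mmask16 *k1, __mmask16 *k2)
	       {
	         _mm512_2intersect_epi32 (a, b, k1, k2);
	       }
	  */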
11338
11339 case IX86_BUILTIN_RDPMC:
11340 case IX86_BUILTIN_RDTSC:
11341 case IX86_BUILTIN_RDTSCP:
11342 case IX86_BUILTIN_XGETBV:
11343
11344 op0 = gen_reg_rtx (DImode);
11345 op1 = gen_reg_rtx (DImode);
11346
11347 if (fcode == IX86_BUILTIN_RDPMC)
11348 {
11349 arg0 = CALL_EXPR_ARG (exp, 0);
11350 op2 = expand_normal (arg0);
11351 if (!register_operand (op2, SImode))
11352 op2 = copy_to_mode_reg (SImode, op2);
11353
11354 insn = (TARGET_64BIT
11355 ? gen_rdpmc_rex64 (op0, op1, op2)
11356 : gen_rdpmc (op0, op2));
11357 emit_insn (insn);
11358 }
11359 else if (fcode == IX86_BUILTIN_XGETBV)
11360 {
11361 arg0 = CALL_EXPR_ARG (exp, 0);
11362 op2 = expand_normal (arg0);
11363 if (!register_operand (op2, SImode))
11364 op2 = copy_to_mode_reg (SImode, op2);
11365
11366 insn = (TARGET_64BIT
11367 ? gen_xgetbv_rex64 (op0, op1, op2)
11368 : gen_xgetbv (op0, op2));
11369 emit_insn (insn);
11370 }
11371 else if (fcode == IX86_BUILTIN_RDTSC)
11372 {
11373 insn = (TARGET_64BIT
11374 ? gen_rdtsc_rex64 (op0, op1)
11375 : gen_rdtsc (op0));
11376 emit_insn (insn);
11377 }
11378 else
11379 {
11380 op2 = gen_reg_rtx (SImode);
11381
11382 insn = (TARGET_64BIT
11383 ? gen_rdtscp_rex64 (op0, op1, op2)
11384 : gen_rdtscp (op0, op2));
11385 emit_insn (insn);
11386
11387 arg0 = CALL_EXPR_ARG (exp, 0);
11388 op4 = expand_normal (arg0);
11389 if (!address_operand (op4, VOIDmode))
11390 {
11391 op4 = convert_memory_address (Pmode, op4);
11392 op4 = copy_addr_to_reg (op4);
11393 }
11394 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11395 }
11396
11397 if (target == 0
11398 || !register_operand (target, DImode))
11399 target = gen_reg_rtx (DImode);
11400
11401 if (TARGET_64BIT)
11402 {
11403 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11404 op1, 1, OPTAB_DIRECT);
11405 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11406 op0, 1, OPTAB_DIRECT);
11407 }
11408
11409 emit_move_insn (target, op0);
11410 return target;
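	  /* Illustrative only: on 64-bit targets the two 32-bit halves these
	     instructions return in EDX:EAX are recombined above as
	     (hi << 32) | lo, which is the single 64-bit value the user-level
	     wrappers hand back, e.g. (assuming the usual __rdtscp wrapper):

	       unsigned int aux;
	       unsigned long long tsc = __rdtscp (&aux);  // aux <- IA32_TSC_AUX
	  */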
11411
11412 case IX86_BUILTIN_ENQCMD:
11413 case IX86_BUILTIN_ENQCMDS:
11414 case IX86_BUILTIN_MOVDIR64B:
11415
11416 arg0 = CALL_EXPR_ARG (exp, 0);
11417 arg1 = CALL_EXPR_ARG (exp, 1);
11418 op0 = expand_normal (arg0);
11419 op1 = expand_normal (arg1);
11420
11421 op0 = ix86_zero_extend_to_Pmode (op0);
11422 if (!address_operand (op1, VOIDmode))
11423 {
11424 op1 = convert_memory_address (Pmode, op1);
11425 op1 = copy_addr_to_reg (op1);
11426 }
11427 op1 = gen_rtx_MEM (XImode, op1);
11428
11429 if (fcode == IX86_BUILTIN_MOVDIR64B)
11430 {
11431 emit_insn (gen_movdir64b (Pmode, op0, op1));
11432 return 0;
11433 }
11434 else
11435 {
11436 rtx pat;
11437
11438 target = gen_reg_rtx (SImode);
11439 emit_move_insn (target, const0_rtx);
11440 target = gen_rtx_SUBREG (QImode, target, 0);
11441
11442 if (fcode == IX86_BUILTIN_ENQCMD)
11443 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11444 else
11445 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11446
11447 emit_insn (pat);
11448
11449 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11450 gen_rtx_fmt_ee (EQ, QImode,
11451 SET_DEST (pat),
11452 const0_rtx)));
11453
11454 return SUBREG_REG (target);
11455 }
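	/* Illustrative only: a hedged sketch of the MOVDIR64B entry point,
	   assuming the usual _movdir64b (void *, const void *) wrapper
	   (-mmovdir64b).  The destination is architecturally required to be
	   64-byte aligned; the source is read as one 64-byte chunk, which is
	   the XImode MEM built above.  The ENQCMD/ENQCMDS branch differs
	   only in also returning a status derived from the flags register.

	     void post_descriptor (void *portal64, const void *desc64)
	     {
	       _movdir64b (portal64, desc64);
	     }
	*/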
11456
11457 case IX86_BUILTIN_FXSAVE:
11458 case IX86_BUILTIN_FXRSTOR:
11459 case IX86_BUILTIN_FXSAVE64:
11460 case IX86_BUILTIN_FXRSTOR64:
11461 case IX86_BUILTIN_FNSTENV:
11462 case IX86_BUILTIN_FLDENV:
11463 mode0 = BLKmode;
11464 switch (fcode)
11465 {
11466 case IX86_BUILTIN_FXSAVE:
11467 icode = CODE_FOR_fxsave;
11468 break;
11469 case IX86_BUILTIN_FXRSTOR:
11470 icode = CODE_FOR_fxrstor;
11471 break;
11472 case IX86_BUILTIN_FXSAVE64:
11473 icode = CODE_FOR_fxsave64;
11474 break;
11475 case IX86_BUILTIN_FXRSTOR64:
11476 icode = CODE_FOR_fxrstor64;
11477 break;
11478 case IX86_BUILTIN_FNSTENV:
11479 icode = CODE_FOR_fnstenv;
11480 break;
11481 case IX86_BUILTIN_FLDENV:
11482 icode = CODE_FOR_fldenv;
11483 break;
11484 default:
11485 gcc_unreachable ();
11486 }
11487
11488 arg0 = CALL_EXPR_ARG (exp, 0);
11489 op0 = expand_normal (arg0);
11490
11491 if (!address_operand (op0, VOIDmode))
11492 {
11493 op0 = convert_memory_address (Pmode, op0);
11494 op0 = copy_addr_to_reg (op0);
11495 }
11496 op0 = gen_rtx_MEM (mode0, op0);
11497
11498 pat = GEN_FCN (icode) (op0);
11499 if (pat)
11500 emit_insn (pat);
11501 return 0;
11502
11503 case IX86_BUILTIN_XSETBV:
11504 arg0 = CALL_EXPR_ARG (exp, 0);
11505 arg1 = CALL_EXPR_ARG (exp, 1);
11506 op0 = expand_normal (arg0);
11507 op1 = expand_normal (arg1);
11508
11509 if (!REG_P (op0))
11510 op0 = copy_to_mode_reg (SImode, op0);
11511
11512 op1 = force_reg (DImode, op1);
11513
11514 if (TARGET_64BIT)
11515 {
11516 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11517 NULL, 1, OPTAB_DIRECT);
11518
11519 icode = CODE_FOR_xsetbv_rex64;
11520
11521 op2 = gen_lowpart (SImode, op2);
11522 op1 = gen_lowpart (SImode, op1);
11523 pat = GEN_FCN (icode) (op0, op1, op2);
11524 }
11525 else
11526 {
11527 icode = CODE_FOR_xsetbv;
11528
11529 pat = GEN_FCN (icode) (op0, op1);
11530 }
11531 if (pat)
11532 emit_insn (pat);
11533 return 0;
11534
11535 case IX86_BUILTIN_XSAVE:
11536 case IX86_BUILTIN_XRSTOR:
11537 case IX86_BUILTIN_XSAVE64:
11538 case IX86_BUILTIN_XRSTOR64:
11539 case IX86_BUILTIN_XSAVEOPT:
11540 case IX86_BUILTIN_XSAVEOPT64:
11541 case IX86_BUILTIN_XSAVES:
11542 case IX86_BUILTIN_XRSTORS:
11543 case IX86_BUILTIN_XSAVES64:
11544 case IX86_BUILTIN_XRSTORS64:
11545 case IX86_BUILTIN_XSAVEC:
11546 case IX86_BUILTIN_XSAVEC64:
11547 arg0 = CALL_EXPR_ARG (exp, 0);
11548 arg1 = CALL_EXPR_ARG (exp, 1);
11549 op0 = expand_normal (arg0);
11550 op1 = expand_normal (arg1);
11551
11552 if (!address_operand (op0, VOIDmode))
11553 {
11554 op0 = convert_memory_address (Pmode, op0);
11555 op0 = copy_addr_to_reg (op0);
11556 }
11557 op0 = gen_rtx_MEM (BLKmode, op0);
11558
11559 op1 = force_reg (DImode, op1);
11560
11561 if (TARGET_64BIT)
11562 {
11563 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11564 NULL, 1, OPTAB_DIRECT);
11565 switch (fcode)
11566 {
11567 case IX86_BUILTIN_XSAVE:
11568 icode = CODE_FOR_xsave_rex64;
11569 break;
11570 case IX86_BUILTIN_XRSTOR:
11571 icode = CODE_FOR_xrstor_rex64;
11572 break;
11573 case IX86_BUILTIN_XSAVE64:
11574 icode = CODE_FOR_xsave64;
11575 break;
11576 case IX86_BUILTIN_XRSTOR64:
11577 icode = CODE_FOR_xrstor64;
11578 break;
11579 case IX86_BUILTIN_XSAVEOPT:
11580 icode = CODE_FOR_xsaveopt_rex64;
11581 break;
11582 case IX86_BUILTIN_XSAVEOPT64:
11583 icode = CODE_FOR_xsaveopt64;
11584 break;
11585 case IX86_BUILTIN_XSAVES:
11586 icode = CODE_FOR_xsaves_rex64;
11587 break;
11588 case IX86_BUILTIN_XRSTORS:
11589 icode = CODE_FOR_xrstors_rex64;
11590 break;
11591 case IX86_BUILTIN_XSAVES64:
11592 icode = CODE_FOR_xsaves64;
11593 break;
11594 case IX86_BUILTIN_XRSTORS64:
11595 icode = CODE_FOR_xrstors64;
11596 break;
11597 case IX86_BUILTIN_XSAVEC:
11598 icode = CODE_FOR_xsavec_rex64;
11599 break;
11600 case IX86_BUILTIN_XSAVEC64:
11601 icode = CODE_FOR_xsavec64;
11602 break;
11603 default:
11604 gcc_unreachable ();
11605 }
11606
11607 op2 = gen_lowpart (SImode, op2);
11608 op1 = gen_lowpart (SImode, op1);
11609 pat = GEN_FCN (icode) (op0, op1, op2);
11610 }
11611 else
11612 {
11613 switch (fcode)
11614 {
11615 case IX86_BUILTIN_XSAVE:
11616 icode = CODE_FOR_xsave;
11617 break;
11618 case IX86_BUILTIN_XRSTOR:
11619 icode = CODE_FOR_xrstor;
11620 break;
11621 case IX86_BUILTIN_XSAVEOPT:
11622 icode = CODE_FOR_xsaveopt;
11623 break;
11624 case IX86_BUILTIN_XSAVES:
11625 icode = CODE_FOR_xsaves;
11626 break;
11627 case IX86_BUILTIN_XRSTORS:
11628 icode = CODE_FOR_xrstors;
11629 break;
11630 case IX86_BUILTIN_XSAVEC:
11631 icode = CODE_FOR_xsavec;
11632 break;
11633 default:
11634 gcc_unreachable ();
11635 }
11636 pat = GEN_FCN (icode) (op0, op1);
11637 }
11638
11639 if (pat)
11640 emit_insn (pat);
11641 return 0;
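	  /* Illustrative only: every builtin in this group takes a 64-bit
	     state-component mask, and the code above splits it into the
	     EDX:EAX pair the hardware expects (the low half plus an LSHIFTRT
	     by 32 for the high half).  Assuming the usual <immintrin.h>
	     wrappers:

	       // Save x87/SSE/AVX state (mask bits 0, 1 and 2) into a
	       // sufficiently large, 64-byte aligned area, then restore it.
	       char area[4096] __attribute__ ((aligned (64)));
	       _xsave (area, 0x7);
	       _xrstor (area, 0x7);
	  */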
11642
11643 case IX86_BUILTIN_LLWPCB:
11644 arg0 = CALL_EXPR_ARG (exp, 0);
11645 op0 = expand_normal (arg0);
11646 icode = CODE_FOR_lwp_llwpcb;
11647 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11648 op0 = ix86_zero_extend_to_Pmode (op0);
11649 emit_insn (gen_lwp_llwpcb (op0));
11650 return 0;
11651
11652 case IX86_BUILTIN_SLWPCB:
11653 icode = CODE_FOR_lwp_slwpcb;
11654 if (!target
11655 || !insn_data[icode].operand[0].predicate (target, Pmode))
11656 target = gen_reg_rtx (Pmode);
11657 emit_insn (gen_lwp_slwpcb (target));
11658 return target;
11659
11660 case IX86_BUILTIN_BEXTRI32:
11661 case IX86_BUILTIN_BEXTRI64:
11662 arg0 = CALL_EXPR_ARG (exp, 0);
11663 arg1 = CALL_EXPR_ARG (exp, 1);
11664 op0 = expand_normal (arg0);
11665 op1 = expand_normal (arg1);
11666 icode = (fcode == IX86_BUILTIN_BEXTRI32
11667 ? CODE_FOR_tbm_bextri_si
11668 : CODE_FOR_tbm_bextri_di);
11669 if (!CONST_INT_P (op1))
11670 {
11671 error ("last argument must be an immediate");
11672 return const0_rtx;
11673 }
11674 else
11675 {
11676 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11677 unsigned char lsb_index = INTVAL (op1) & 0xFF;
11678 op1 = GEN_INT (length);
11679 op2 = GEN_INT (lsb_index);
11680
11681 mode1 = insn_data[icode].operand[1].mode;
11682 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11683 op0 = copy_to_mode_reg (mode1, op0);
11684
11685 mode0 = insn_data[icode].operand[0].mode;
11686 if (target == 0
11687 || !register_operand (target, mode0))
11688 target = gen_reg_rtx (mode0);
11689
11690 pat = GEN_FCN (icode) (target, op0, op1, op2);
11691 if (pat)
11692 emit_insn (pat);
11693 return target;
11694 }
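	  /* Illustrative only: the TBM __bextri_u32/__bextri_u64 wrappers
	     (assuming the usual <x86intrin.h> definitions, -mtbm) take a
	     compile-time control word whose low byte is the start bit and
	     whose second byte is the field length -- exactly the decoding
	     performed above -- which is why a non-constant control is
	     rejected with an error.

	       unsigned int extract_8_at_4 (unsigned int x)
	       {
	         // control = (length << 8) | start_bit
	         return __bextri_u32 (x, (8 << 8) | 4);
	       }
	  */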
11695
11696 case IX86_BUILTIN_RDRAND16_STEP:
11697 icode = CODE_FOR_rdrandhi_1;
11698 mode0 = HImode;
11699 goto rdrand_step;
11700
11701 case IX86_BUILTIN_RDRAND32_STEP:
11702 icode = CODE_FOR_rdrandsi_1;
11703 mode0 = SImode;
11704 goto rdrand_step;
11705
11706 case IX86_BUILTIN_RDRAND64_STEP:
11707 icode = CODE_FOR_rdranddi_1;
11708 mode0 = DImode;
11709
11710 rdrand_step:
11711 arg0 = CALL_EXPR_ARG (exp, 0);
11712 op1 = expand_normal (arg0);
11713 if (!address_operand (op1, VOIDmode))
11714 {
11715 op1 = convert_memory_address (Pmode, op1);
11716 op1 = copy_addr_to_reg (op1);
11717 }
11718
11719 op0 = gen_reg_rtx (mode0);
11720 emit_insn (GEN_FCN (icode) (op0));
11721
11722 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11723
11724 op1 = gen_reg_rtx (SImode);
11725 emit_move_insn (op1, CONST1_RTX (SImode));
11726
11727 /* Emit SImode conditional move. */
11728 if (mode0 == HImode)
11729 {
11730 if (TARGET_ZERO_EXTEND_WITH_AND
11731 && optimize_function_for_speed_p (cfun))
11732 {
11733 op2 = force_reg (SImode, const0_rtx);
11734
11735 emit_insn (gen_movstricthi
11736 (gen_lowpart (HImode, op2), op0));
11737 }
11738 else
11739 {
11740 op2 = gen_reg_rtx (SImode);
11741
11742 emit_insn (gen_zero_extendhisi2 (op2, op0));
11743 }
11744 }
11745 else if (mode0 == SImode)
11746 op2 = op0;
11747 else
11748 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11749
11750 if (target == 0
11751 || !register_operand (target, SImode))
11752 target = gen_reg_rtx (SImode);
11753
11754 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11755 const0_rtx);
11756 emit_insn (gen_rtx_SET (target,
11757 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11758 return target;
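	  /* Illustrative only -- the matching user-level retry loop,
	     assuming the usual _rdrand32_step wrapper (<immintrin.h>,
	     -mrdrnd).  The SImode conditional move built above is what turns
	     the carry flag into the wrapper's 0/1 success result.

	       unsigned int random_u32 (void)
	       {
	         unsigned int r;
	         while (!_rdrand32_step (&r))
	           ;  // CF clear: no random data available yet, retry
	         return r;
	       }
	  */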
11759
11760 case IX86_BUILTIN_RDSEED16_STEP:
11761 icode = CODE_FOR_rdseedhi_1;
11762 mode0 = HImode;
11763 goto rdseed_step;
11764
11765 case IX86_BUILTIN_RDSEED32_STEP:
11766 icode = CODE_FOR_rdseedsi_1;
11767 mode0 = SImode;
11768 goto rdseed_step;
11769
11770 case IX86_BUILTIN_RDSEED64_STEP:
11771 icode = CODE_FOR_rdseeddi_1;
11772 mode0 = DImode;
11773
11774 rdseed_step:
11775 arg0 = CALL_EXPR_ARG (exp, 0);
11776 op1 = expand_normal (arg0);
11777 if (!address_operand (op1, VOIDmode))
11778 {
11779 op1 = convert_memory_address (Pmode, op1);
11780 op1 = copy_addr_to_reg (op1);
11781 }
11782
11783 op0 = gen_reg_rtx (mode0);
11784 emit_insn (GEN_FCN (icode) (op0));
11785
11786 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11787
11788 op2 = gen_reg_rtx (QImode);
11789
11790 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11791 const0_rtx);
11792 emit_insn (gen_rtx_SET (op2, pat));
11793
11794 if (target == 0
11795 || !register_operand (target, SImode))
11796 target = gen_reg_rtx (SImode);
11797
11798 emit_insn (gen_zero_extendqisi2 (target, op2));
11799 return target;
11800
11801 case IX86_BUILTIN_SBB32:
11802 icode = CODE_FOR_subborrowsi;
11803 icode2 = CODE_FOR_subborrowsi_0;
11804 mode0 = SImode;
11805 mode1 = DImode;
11806 mode2 = CCmode;
11807 goto handlecarry;
11808
11809 case IX86_BUILTIN_SBB64:
11810 icode = CODE_FOR_subborrowdi;
11811 icode2 = CODE_FOR_subborrowdi_0;
11812 mode0 = DImode;
11813 mode1 = TImode;
11814 mode2 = CCmode;
11815 goto handlecarry;
11816
11817 case IX86_BUILTIN_ADDCARRYX32:
11818 icode = CODE_FOR_addcarrysi;
11819 icode2 = CODE_FOR_addcarrysi_0;
11820 mode0 = SImode;
11821 mode1 = DImode;
11822 mode2 = CCCmode;
11823 goto handlecarry;
11824
11825 case IX86_BUILTIN_ADDCARRYX64:
11826 icode = CODE_FOR_addcarrydi;
11827 icode2 = CODE_FOR_addcarrydi_0;
11828 mode0 = DImode;
11829 mode1 = TImode;
11830 mode2 = CCCmode;
11831
11832 handlecarry:
11833 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11834 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11835 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11836 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
11837
11838 op1 = expand_normal (arg0);
11839 if (!integer_zerop (arg0))
11840 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11841
11842 op2 = expand_normal (arg1);
11843 if (!register_operand (op2, mode0))
11844 op2 = copy_to_mode_reg (mode0, op2);
11845
11846 op3 = expand_normal (arg2);
11847 if (!register_operand (op3, mode0))
11848 op3 = copy_to_mode_reg (mode0, op3);
11849
11850 op4 = expand_normal (arg3);
11851 if (!address_operand (op4, VOIDmode))
11852 {
11853 op4 = convert_memory_address (Pmode, op4);
11854 op4 = copy_addr_to_reg (op4);
11855 }
11856
11857 op0 = gen_reg_rtx (mode0);
11858 if (integer_zerop (arg0))
11859 {
11860 	    /* If arg0 is 0, optimize right away into an add or sub
11861 	       instruction that sets CCCmode flags.  */
11862 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11863 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11864 }
11865 else
11866 {
11867 /* Generate CF from input operand. */
11868 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11869
11870 /* Generate instruction that consumes CF. */
11871 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11872 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11873 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11874 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11875 }
11876
11877 /* Return current CF value. */
11878 if (target == 0)
11879 target = gen_reg_rtx (QImode);
11880
11881 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11882 emit_insn (gen_rtx_SET (target, pat));
11883
11884 /* Store the result. */
11885 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11886
11887 return target;
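	  /* Illustrative only: the user-level wrappers have the shape
	     unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a,
	     unsigned int b, unsigned int *out), and chaining them is the
	     point of the carry-in/carry-out plumbing above -- e.g. a 64-bit
	     addition built from 32-bit pieces:

	       unsigned long long add64 (unsigned int a_lo, unsigned int a_hi,
	                                 unsigned int b_lo, unsigned int b_hi)
	       {
	         unsigned int lo, hi;
	         unsigned char c = _addcarry_u32 (0, a_lo, b_lo, &lo);
	         (void) _addcarry_u32 (c, a_hi, b_hi, &hi);
	         return ((unsigned long long) hi << 32) | lo;
	       }

	     The integer_zerop (arg0) check above lets the first link of such
	     a chain use a plain ADD/SUB instead of ADC/SBB.  */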
11888
11889 case IX86_BUILTIN_READ_FLAGS:
11890 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11891
11892 if (optimize
11893 || target == NULL_RTX
11894 || !nonimmediate_operand (target, word_mode)
11895 || GET_MODE (target) != word_mode)
11896 target = gen_reg_rtx (word_mode);
11897
11898 emit_insn (gen_pop (target));
11899 return target;
11900
11901 case IX86_BUILTIN_WRITE_FLAGS:
11902
11903 arg0 = CALL_EXPR_ARG (exp, 0);
11904 op0 = expand_normal (arg0);
11905 if (!general_no_elim_operand (op0, word_mode))
11906 op0 = copy_to_mode_reg (word_mode, op0);
11907
11908 emit_insn (gen_push (op0));
11909 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11910 return 0;
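	  /* Illustrative only: these two builtins back the __readeflags /
	     __writeeflags wrappers (assuming the usual <x86intrin.h>
	     definitions); the push/pop pairs above are literally
	     PUSHF + POP reg and PUSH reg + POPF.

	       unsigned long long flags = __readeflags ();
	       __writeeflags (flags);   // write the (unmodified) value back
	  */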
11911
11912 case IX86_BUILTIN_KTESTC8:
11913 icode = CODE_FOR_ktestqi;
11914 mode3 = CCCmode;
11915 goto kortest;
11916
11917 case IX86_BUILTIN_KTESTZ8:
11918 icode = CODE_FOR_ktestqi;
11919 mode3 = CCZmode;
11920 goto kortest;
11921
11922 case IX86_BUILTIN_KTESTC16:
11923 icode = CODE_FOR_ktesthi;
11924 mode3 = CCCmode;
11925 goto kortest;
11926
11927 case IX86_BUILTIN_KTESTZ16:
11928 icode = CODE_FOR_ktesthi;
11929 mode3 = CCZmode;
11930 goto kortest;
11931
11932 case IX86_BUILTIN_KTESTC32:
11933 icode = CODE_FOR_ktestsi;
11934 mode3 = CCCmode;
11935 goto kortest;
11936
11937 case IX86_BUILTIN_KTESTZ32:
11938 icode = CODE_FOR_ktestsi;
11939 mode3 = CCZmode;
11940 goto kortest;
11941
11942 case IX86_BUILTIN_KTESTC64:
11943 icode = CODE_FOR_ktestdi;
11944 mode3 = CCCmode;
11945 goto kortest;
11946
11947 case IX86_BUILTIN_KTESTZ64:
11948 icode = CODE_FOR_ktestdi;
11949 mode3 = CCZmode;
11950 goto kortest;
11951
11952 case IX86_BUILTIN_KORTESTC8:
11953 icode = CODE_FOR_kortestqi;
11954 mode3 = CCCmode;
11955 goto kortest;
11956
11957 case IX86_BUILTIN_KORTESTZ8:
11958 icode = CODE_FOR_kortestqi;
11959 mode3 = CCZmode;
11960 goto kortest;
11961
11962 case IX86_BUILTIN_KORTESTC16:
11963 icode = CODE_FOR_kortesthi;
11964 mode3 = CCCmode;
11965 goto kortest;
11966
11967 case IX86_BUILTIN_KORTESTZ16:
11968 icode = CODE_FOR_kortesthi;
11969 mode3 = CCZmode;
11970 goto kortest;
11971
11972 case IX86_BUILTIN_KORTESTC32:
11973 icode = CODE_FOR_kortestsi;
11974 mode3 = CCCmode;
11975 goto kortest;
11976
11977 case IX86_BUILTIN_KORTESTZ32:
11978 icode = CODE_FOR_kortestsi;
11979 mode3 = CCZmode;
11980 goto kortest;
11981
11982 case IX86_BUILTIN_KORTESTC64:
11983 icode = CODE_FOR_kortestdi;
11984 mode3 = CCCmode;
11985 goto kortest;
11986
11987 case IX86_BUILTIN_KORTESTZ64:
11988 icode = CODE_FOR_kortestdi;
11989 mode3 = CCZmode;
11990
11991 kortest:
11992 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
11993 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
11994 op0 = expand_normal (arg0);
11995 op1 = expand_normal (arg1);
11996
11997 mode0 = insn_data[icode].operand[0].mode;
11998 mode1 = insn_data[icode].operand[1].mode;
11999
12000 if (GET_MODE (op0) != VOIDmode)
12001 op0 = force_reg (GET_MODE (op0), op0);
12002
12003 op0 = gen_lowpart (mode0, op0);
12004
12005 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12006 op0 = copy_to_mode_reg (mode0, op0);
12007
12008 if (GET_MODE (op1) != VOIDmode)
12009 op1 = force_reg (GET_MODE (op1), op1);
12010
12011 op1 = gen_lowpart (mode1, op1);
12012
12013 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12014 op1 = copy_to_mode_reg (mode1, op1);
12015
12016 target = gen_reg_rtx (QImode);
12017
12018 /* Emit kortest. */
12019 emit_insn (GEN_FCN (icode) (op0, op1));
12020 /* And use setcc to return result from flags. */
12021 ix86_expand_setcc (target, EQ,
12022 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12023 return target;
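	  /* Illustrative only: at the user level these builtins appear as
	     pairs such as _mm512_kortestz/_mm512_kortestc (assuming the
	     usual AVX-512 <immintrin.h> wrappers), which test two mask
	     registers and return the ZF or CF result that the setcc above
	     materializes into a QImode value.

	       int both_empty (__mmask16 a, __mmask16 b)
	       {
	         return _mm512_kortestz (a, b);   // 1 iff (a | b) == 0
	       }
	  */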
12024
12025 case IX86_BUILTIN_GATHERSIV2DF:
12026 icode = CODE_FOR_avx2_gathersiv2df;
12027 goto gather_gen;
12028 case IX86_BUILTIN_GATHERSIV4DF:
12029 icode = CODE_FOR_avx2_gathersiv4df;
12030 goto gather_gen;
12031 case IX86_BUILTIN_GATHERDIV2DF:
12032 icode = CODE_FOR_avx2_gatherdiv2df;
12033 goto gather_gen;
12034 case IX86_BUILTIN_GATHERDIV4DF:
12035 icode = CODE_FOR_avx2_gatherdiv4df;
12036 goto gather_gen;
12037 case IX86_BUILTIN_GATHERSIV4SF:
12038 icode = CODE_FOR_avx2_gathersiv4sf;
12039 goto gather_gen;
12040 case IX86_BUILTIN_GATHERSIV8SF:
12041 icode = CODE_FOR_avx2_gathersiv8sf;
12042 goto gather_gen;
12043 case IX86_BUILTIN_GATHERDIV4SF:
12044 icode = CODE_FOR_avx2_gatherdiv4sf;
12045 goto gather_gen;
12046 case IX86_BUILTIN_GATHERDIV8SF:
12047 icode = CODE_FOR_avx2_gatherdiv8sf;
12048 goto gather_gen;
12049 case IX86_BUILTIN_GATHERSIV2DI:
12050 icode = CODE_FOR_avx2_gathersiv2di;
12051 goto gather_gen;
12052 case IX86_BUILTIN_GATHERSIV4DI:
12053 icode = CODE_FOR_avx2_gathersiv4di;
12054 goto gather_gen;
12055 case IX86_BUILTIN_GATHERDIV2DI:
12056 icode = CODE_FOR_avx2_gatherdiv2di;
12057 goto gather_gen;
12058 case IX86_BUILTIN_GATHERDIV4DI:
12059 icode = CODE_FOR_avx2_gatherdiv4di;
12060 goto gather_gen;
12061 case IX86_BUILTIN_GATHERSIV4SI:
12062 icode = CODE_FOR_avx2_gathersiv4si;
12063 goto gather_gen;
12064 case IX86_BUILTIN_GATHERSIV8SI:
12065 icode = CODE_FOR_avx2_gathersiv8si;
12066 goto gather_gen;
12067 case IX86_BUILTIN_GATHERDIV4SI:
12068 icode = CODE_FOR_avx2_gatherdiv4si;
12069 goto gather_gen;
12070 case IX86_BUILTIN_GATHERDIV8SI:
12071 icode = CODE_FOR_avx2_gatherdiv8si;
12072 goto gather_gen;
12073 case IX86_BUILTIN_GATHERALTSIV4DF:
12074 icode = CODE_FOR_avx2_gathersiv4df;
12075 goto gather_gen;
12076 case IX86_BUILTIN_GATHERALTDIV8SF:
12077 icode = CODE_FOR_avx2_gatherdiv8sf;
12078 goto gather_gen;
12079 case IX86_BUILTIN_GATHERALTSIV4DI:
12080 icode = CODE_FOR_avx2_gathersiv4di;
12081 goto gather_gen;
12082 case IX86_BUILTIN_GATHERALTDIV8SI:
12083 icode = CODE_FOR_avx2_gatherdiv8si;
12084 goto gather_gen;
12085 case IX86_BUILTIN_GATHER3SIV16SF:
12086 icode = CODE_FOR_avx512f_gathersiv16sf;
12087 goto gather_gen;
12088 case IX86_BUILTIN_GATHER3SIV8DF:
12089 icode = CODE_FOR_avx512f_gathersiv8df;
12090 goto gather_gen;
12091 case IX86_BUILTIN_GATHER3DIV16SF:
12092 icode = CODE_FOR_avx512f_gatherdiv16sf;
12093 goto gather_gen;
12094 case IX86_BUILTIN_GATHER3DIV8DF:
12095 icode = CODE_FOR_avx512f_gatherdiv8df;
12096 goto gather_gen;
12097 case IX86_BUILTIN_GATHER3SIV16SI:
12098 icode = CODE_FOR_avx512f_gathersiv16si;
12099 goto gather_gen;
12100 case IX86_BUILTIN_GATHER3SIV8DI:
12101 icode = CODE_FOR_avx512f_gathersiv8di;
12102 goto gather_gen;
12103 case IX86_BUILTIN_GATHER3DIV16SI:
12104 icode = CODE_FOR_avx512f_gatherdiv16si;
12105 goto gather_gen;
12106 case IX86_BUILTIN_GATHER3DIV8DI:
12107 icode = CODE_FOR_avx512f_gatherdiv8di;
12108 goto gather_gen;
12109 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12110 icode = CODE_FOR_avx512f_gathersiv8df;
12111 goto gather_gen;
12112 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12113 icode = CODE_FOR_avx512f_gatherdiv16sf;
12114 goto gather_gen;
12115 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12116 icode = CODE_FOR_avx512f_gathersiv8di;
12117 goto gather_gen;
12118 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12119 icode = CODE_FOR_avx512f_gatherdiv16si;
12120 goto gather_gen;
12121 case IX86_BUILTIN_GATHER3SIV2DF:
12122 icode = CODE_FOR_avx512vl_gathersiv2df;
12123 goto gather_gen;
12124 case IX86_BUILTIN_GATHER3SIV4DF:
12125 icode = CODE_FOR_avx512vl_gathersiv4df;
12126 goto gather_gen;
12127 case IX86_BUILTIN_GATHER3DIV2DF:
12128 icode = CODE_FOR_avx512vl_gatherdiv2df;
12129 goto gather_gen;
12130 case IX86_BUILTIN_GATHER3DIV4DF:
12131 icode = CODE_FOR_avx512vl_gatherdiv4df;
12132 goto gather_gen;
12133 case IX86_BUILTIN_GATHER3SIV4SF:
12134 icode = CODE_FOR_avx512vl_gathersiv4sf;
12135 goto gather_gen;
12136 case IX86_BUILTIN_GATHER3SIV8SF:
12137 icode = CODE_FOR_avx512vl_gathersiv8sf;
12138 goto gather_gen;
12139 case IX86_BUILTIN_GATHER3DIV4SF:
12140 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12141 goto gather_gen;
12142 case IX86_BUILTIN_GATHER3DIV8SF:
12143 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12144 goto gather_gen;
12145 case IX86_BUILTIN_GATHER3SIV2DI:
12146 icode = CODE_FOR_avx512vl_gathersiv2di;
12147 goto gather_gen;
12148 case IX86_BUILTIN_GATHER3SIV4DI:
12149 icode = CODE_FOR_avx512vl_gathersiv4di;
12150 goto gather_gen;
12151 case IX86_BUILTIN_GATHER3DIV2DI:
12152 icode = CODE_FOR_avx512vl_gatherdiv2di;
12153 goto gather_gen;
12154 case IX86_BUILTIN_GATHER3DIV4DI:
12155 icode = CODE_FOR_avx512vl_gatherdiv4di;
12156 goto gather_gen;
12157 case IX86_BUILTIN_GATHER3SIV4SI:
12158 icode = CODE_FOR_avx512vl_gathersiv4si;
12159 goto gather_gen;
12160 case IX86_BUILTIN_GATHER3SIV8SI:
12161 icode = CODE_FOR_avx512vl_gathersiv8si;
12162 goto gather_gen;
12163 case IX86_BUILTIN_GATHER3DIV4SI:
12164 icode = CODE_FOR_avx512vl_gatherdiv4si;
12165 goto gather_gen;
12166 case IX86_BUILTIN_GATHER3DIV8SI:
12167 icode = CODE_FOR_avx512vl_gatherdiv8si;
12168 goto gather_gen;
12169 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12170 icode = CODE_FOR_avx512vl_gathersiv4df;
12171 goto gather_gen;
12172 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12173 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12174 goto gather_gen;
12175 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12176 icode = CODE_FOR_avx512vl_gathersiv4di;
12177 goto gather_gen;
12178 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12179 icode = CODE_FOR_avx512vl_gatherdiv8si;
12180 goto gather_gen;
12181 case IX86_BUILTIN_SCATTERSIV16SF:
12182 icode = CODE_FOR_avx512f_scattersiv16sf;
12183 goto scatter_gen;
12184 case IX86_BUILTIN_SCATTERSIV8DF:
12185 icode = CODE_FOR_avx512f_scattersiv8df;
12186 goto scatter_gen;
12187 case IX86_BUILTIN_SCATTERDIV16SF:
12188 icode = CODE_FOR_avx512f_scatterdiv16sf;
12189 goto scatter_gen;
12190 case IX86_BUILTIN_SCATTERDIV8DF:
12191 icode = CODE_FOR_avx512f_scatterdiv8df;
12192 goto scatter_gen;
12193 case IX86_BUILTIN_SCATTERSIV16SI:
12194 icode = CODE_FOR_avx512f_scattersiv16si;
12195 goto scatter_gen;
12196 case IX86_BUILTIN_SCATTERSIV8DI:
12197 icode = CODE_FOR_avx512f_scattersiv8di;
12198 goto scatter_gen;
12199 case IX86_BUILTIN_SCATTERDIV16SI:
12200 icode = CODE_FOR_avx512f_scatterdiv16si;
12201 goto scatter_gen;
12202 case IX86_BUILTIN_SCATTERDIV8DI:
12203 icode = CODE_FOR_avx512f_scatterdiv8di;
12204 goto scatter_gen;
12205 case IX86_BUILTIN_SCATTERSIV8SF:
12206 icode = CODE_FOR_avx512vl_scattersiv8sf;
12207 goto scatter_gen;
12208 case IX86_BUILTIN_SCATTERSIV4SF:
12209 icode = CODE_FOR_avx512vl_scattersiv4sf;
12210 goto scatter_gen;
12211 case IX86_BUILTIN_SCATTERSIV4DF:
12212 icode = CODE_FOR_avx512vl_scattersiv4df;
12213 goto scatter_gen;
12214 case IX86_BUILTIN_SCATTERSIV2DF:
12215 icode = CODE_FOR_avx512vl_scattersiv2df;
12216 goto scatter_gen;
12217 case IX86_BUILTIN_SCATTERDIV8SF:
12218 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12219 goto scatter_gen;
12220 case IX86_BUILTIN_SCATTERDIV4SF:
12221 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12222 goto scatter_gen;
12223 case IX86_BUILTIN_SCATTERDIV4DF:
12224 icode = CODE_FOR_avx512vl_scatterdiv4df;
12225 goto scatter_gen;
12226 case IX86_BUILTIN_SCATTERDIV2DF:
12227 icode = CODE_FOR_avx512vl_scatterdiv2df;
12228 goto scatter_gen;
12229 case IX86_BUILTIN_SCATTERSIV8SI:
12230 icode = CODE_FOR_avx512vl_scattersiv8si;
12231 goto scatter_gen;
12232 case IX86_BUILTIN_SCATTERSIV4SI:
12233 icode = CODE_FOR_avx512vl_scattersiv4si;
12234 goto scatter_gen;
12235 case IX86_BUILTIN_SCATTERSIV4DI:
12236 icode = CODE_FOR_avx512vl_scattersiv4di;
12237 goto scatter_gen;
12238 case IX86_BUILTIN_SCATTERSIV2DI:
12239 icode = CODE_FOR_avx512vl_scattersiv2di;
12240 goto scatter_gen;
12241 case IX86_BUILTIN_SCATTERDIV8SI:
12242 icode = CODE_FOR_avx512vl_scatterdiv8si;
12243 goto scatter_gen;
12244 case IX86_BUILTIN_SCATTERDIV4SI:
12245 icode = CODE_FOR_avx512vl_scatterdiv4si;
12246 goto scatter_gen;
12247 case IX86_BUILTIN_SCATTERDIV4DI:
12248 icode = CODE_FOR_avx512vl_scatterdiv4di;
12249 goto scatter_gen;
12250 case IX86_BUILTIN_SCATTERDIV2DI:
12251 icode = CODE_FOR_avx512vl_scatterdiv2di;
12252 goto scatter_gen;
12253 case IX86_BUILTIN_GATHERPFDPD:
12254 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12255 goto vec_prefetch_gen;
12256 case IX86_BUILTIN_SCATTERALTSIV8DF:
12257 icode = CODE_FOR_avx512f_scattersiv8df;
12258 goto scatter_gen;
12259 case IX86_BUILTIN_SCATTERALTDIV16SF:
12260 icode = CODE_FOR_avx512f_scatterdiv16sf;
12261 goto scatter_gen;
12262 case IX86_BUILTIN_SCATTERALTSIV8DI:
12263 icode = CODE_FOR_avx512f_scattersiv8di;
12264 goto scatter_gen;
12265 case IX86_BUILTIN_SCATTERALTDIV16SI:
12266 icode = CODE_FOR_avx512f_scatterdiv16si;
12267 goto scatter_gen;
12268 case IX86_BUILTIN_SCATTERALTSIV4DF:
12269 icode = CODE_FOR_avx512vl_scattersiv4df;
12270 goto scatter_gen;
12271 case IX86_BUILTIN_SCATTERALTDIV8SF:
12272 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12273 goto scatter_gen;
12274 case IX86_BUILTIN_SCATTERALTSIV4DI:
12275 icode = CODE_FOR_avx512vl_scattersiv4di;
12276 goto scatter_gen;
12277 case IX86_BUILTIN_SCATTERALTDIV8SI:
12278 icode = CODE_FOR_avx512vl_scatterdiv8si;
12279 goto scatter_gen;
12280 case IX86_BUILTIN_SCATTERALTSIV2DF:
12281 icode = CODE_FOR_avx512vl_scattersiv2df;
12282 goto scatter_gen;
12283 case IX86_BUILTIN_SCATTERALTDIV4SF:
12284 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12285 goto scatter_gen;
12286 case IX86_BUILTIN_SCATTERALTSIV2DI:
12287 icode = CODE_FOR_avx512vl_scattersiv2di;
12288 goto scatter_gen;
12289 case IX86_BUILTIN_SCATTERALTDIV4SI:
12290 icode = CODE_FOR_avx512vl_scatterdiv4si;
12291 goto scatter_gen;
12292 case IX86_BUILTIN_GATHERPFDPS:
12293 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12294 goto vec_prefetch_gen;
12295 case IX86_BUILTIN_GATHERPFQPD:
12296 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12297 goto vec_prefetch_gen;
12298 case IX86_BUILTIN_GATHERPFQPS:
12299 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12300 goto vec_prefetch_gen;
12301 case IX86_BUILTIN_SCATTERPFDPD:
12302 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12303 goto vec_prefetch_gen;
12304 case IX86_BUILTIN_SCATTERPFDPS:
12305 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12306 goto vec_prefetch_gen;
12307 case IX86_BUILTIN_SCATTERPFQPD:
12308 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12309 goto vec_prefetch_gen;
12310 case IX86_BUILTIN_SCATTERPFQPS:
12311 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12312 goto vec_prefetch_gen;
12313
12314 gather_gen:
12315 rtx half;
12316 rtx (*gen) (rtx, rtx);
12317
12318 arg0 = CALL_EXPR_ARG (exp, 0);
12319 arg1 = CALL_EXPR_ARG (exp, 1);
12320 arg2 = CALL_EXPR_ARG (exp, 2);
12321 arg3 = CALL_EXPR_ARG (exp, 3);
12322 arg4 = CALL_EXPR_ARG (exp, 4);
12323 op0 = expand_normal (arg0);
12324 op1 = expand_normal (arg1);
12325 op2 = expand_normal (arg2);
12326 op3 = expand_normal (arg3);
12327 op4 = expand_normal (arg4);
12328 /* Note the arg order is different from the operand order. */
12329 mode0 = insn_data[icode].operand[1].mode;
12330 mode2 = insn_data[icode].operand[3].mode;
12331 mode3 = insn_data[icode].operand[4].mode;
12332 mode4 = insn_data[icode].operand[5].mode;
12333
12334 if (target == NULL_RTX
12335 || GET_MODE (target) != insn_data[icode].operand[0].mode
12336 || !insn_data[icode].operand[0].predicate (target,
12337 GET_MODE (target)))
12338 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12339 else
12340 subtarget = target;
12341
12342 switch (fcode)
12343 {
12344 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12345 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12346 half = gen_reg_rtx (V8SImode);
12347 if (!nonimmediate_operand (op2, V16SImode))
12348 op2 = copy_to_mode_reg (V16SImode, op2);
12349 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12350 op2 = half;
12351 break;
12352 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12353 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12354 case IX86_BUILTIN_GATHERALTSIV4DF:
12355 case IX86_BUILTIN_GATHERALTSIV4DI:
12356 half = gen_reg_rtx (V4SImode);
12357 if (!nonimmediate_operand (op2, V8SImode))
12358 op2 = copy_to_mode_reg (V8SImode, op2);
12359 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12360 op2 = half;
12361 break;
12362 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12363 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12364 half = gen_reg_rtx (mode0);
12365 if (mode0 == V8SFmode)
12366 gen = gen_vec_extract_lo_v16sf;
12367 else
12368 gen = gen_vec_extract_lo_v16si;
12369 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12370 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12371 emit_insn (gen (half, op0));
12372 op0 = half;
12373 op3 = lowpart_subreg (QImode, op3, HImode);
12374 break;
12375 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12376 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12377 case IX86_BUILTIN_GATHERALTDIV8SF:
12378 case IX86_BUILTIN_GATHERALTDIV8SI:
12379 half = gen_reg_rtx (mode0);
12380 if (mode0 == V4SFmode)
12381 gen = gen_vec_extract_lo_v8sf;
12382 else
12383 gen = gen_vec_extract_lo_v8si;
12384 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12385 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12386 emit_insn (gen (half, op0));
12387 op0 = half;
12388 if (VECTOR_MODE_P (GET_MODE (op3)))
12389 {
12390 half = gen_reg_rtx (mode0);
12391 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12392 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12393 emit_insn (gen (half, op3));
12394 op3 = half;
12395 }
12396 break;
12397 default:
12398 break;
12399 }
12400
12401 	  /* Force the memory operand to be addressed through a base register
12402 	     here.  We don't want to do this to the memory operands of other
12403 	     builtin functions.  */
12404 op1 = ix86_zero_extend_to_Pmode (op1);
12405
12406 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12407 op0 = copy_to_mode_reg (mode0, op0);
12408 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12409 op1 = copy_to_mode_reg (Pmode, op1);
12410 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12411 op2 = copy_to_mode_reg (mode2, op2);
12412
12413 op3 = fixup_modeless_constant (op3, mode3);
12414
12415 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12416 {
12417 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12418 op3 = copy_to_mode_reg (mode3, op3);
12419 }
12420 else
12421 {
12422 op3 = copy_to_reg (op3);
12423 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12424 }
12425 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12426 {
12427 error ("the last argument must be scale 1, 2, 4, 8");
12428 return const0_rtx;
12429 }
12430
12431 /* Optimize. If mask is known to have all high bits set,
12432 replace op0 with pc_rtx to signal that the instruction
12433 overwrites the whole destination and doesn't use its
12434 previous contents. */
12435 if (optimize)
12436 {
12437 if (TREE_CODE (arg3) == INTEGER_CST)
12438 {
12439 if (integer_all_onesp (arg3))
12440 op0 = pc_rtx;
12441 }
12442 else if (TREE_CODE (arg3) == VECTOR_CST)
12443 {
12444 unsigned int negative = 0;
12445 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12446 {
12447 tree cst = VECTOR_CST_ELT (arg3, i);
12448 if (TREE_CODE (cst) == INTEGER_CST
12449 && tree_int_cst_sign_bit (cst))
12450 negative++;
12451 else if (TREE_CODE (cst) == REAL_CST
12452 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12453 negative++;
12454 }
12455 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12456 op0 = pc_rtx;
12457 }
12458 else if (TREE_CODE (arg3) == SSA_NAME
12459 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12460 {
12461 /* Recognize also when mask is like:
12462 __v2df src = _mm_setzero_pd ();
12463 __v2df mask = _mm_cmpeq_pd (src, src);
12464 or
12465 __v8sf src = _mm256_setzero_ps ();
12466 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12467 as that is a cheaper way to load all ones into
12468 a register than having to load a constant from
12469 memory. */
12470 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12471 if (is_gimple_call (def_stmt))
12472 {
12473 tree fndecl = gimple_call_fndecl (def_stmt);
12474 if (fndecl
12475 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12476 switch (DECL_MD_FUNCTION_CODE (fndecl))
12477 {
12478 case IX86_BUILTIN_CMPPD:
12479 case IX86_BUILTIN_CMPPS:
12480 case IX86_BUILTIN_CMPPD256:
12481 case IX86_BUILTIN_CMPPS256:
12482 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12483 break;
12484 /* FALLTHRU */
12485 case IX86_BUILTIN_CMPEQPD:
12486 case IX86_BUILTIN_CMPEQPS:
12487 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12488 && initializer_zerop (gimple_call_arg (def_stmt,
12489 1)))
12490 op0 = pc_rtx;
12491 break;
12492 default:
12493 break;
12494 }
12495 }
12496 }
12497 }
12498
12499 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12500 if (! pat)
12501 return const0_rtx;
12502 emit_insn (pat);
12503
12504 switch (fcode)
12505 {
12506 case IX86_BUILTIN_GATHER3DIV16SF:
12507 if (target == NULL_RTX)
12508 target = gen_reg_rtx (V8SFmode);
12509 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12510 break;
12511 case IX86_BUILTIN_GATHER3DIV16SI:
12512 if (target == NULL_RTX)
12513 target = gen_reg_rtx (V8SImode);
12514 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12515 break;
12516 case IX86_BUILTIN_GATHER3DIV8SF:
12517 case IX86_BUILTIN_GATHERDIV8SF:
12518 if (target == NULL_RTX)
12519 target = gen_reg_rtx (V4SFmode);
12520 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12521 break;
12522 case IX86_BUILTIN_GATHER3DIV8SI:
12523 case IX86_BUILTIN_GATHERDIV8SI:
12524 if (target == NULL_RTX)
12525 target = gen_reg_rtx (V4SImode);
12526 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12527 break;
12528 default:
12529 target = subtarget;
12530 break;
12531 }
12532 return target;
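	  /* Illustrative only -- how the all-ones-mask optimization above is
	     typically triggered from user code, assuming the usual AVX2
	     gather wrappers from <immintrin.h>:

	       __m128d gather2 (const double *base, __m128i idx)
	       {
	         __m128d src  = _mm_setzero_pd ();
	         __m128d mask = _mm_cmpeq_pd (src, src);   // all-ones mask
	         return _mm_mask_i32gather_pd (src, base, idx, mask, 8);
	       }

	     The VECTOR_CST/SSA_NAME checks recognize MASK as all ones and
	     replace op0 with pc_rtx, so the emitted gather has no dependency
	     on the previous destination contents.  */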
12533
12534 scatter_gen:
12535 arg0 = CALL_EXPR_ARG (exp, 0);
12536 arg1 = CALL_EXPR_ARG (exp, 1);
12537 arg2 = CALL_EXPR_ARG (exp, 2);
12538 arg3 = CALL_EXPR_ARG (exp, 3);
12539 arg4 = CALL_EXPR_ARG (exp, 4);
12540 op0 = expand_normal (arg0);
12541 op1 = expand_normal (arg1);
12542 op2 = expand_normal (arg2);
12543 op3 = expand_normal (arg3);
12544 op4 = expand_normal (arg4);
12545 mode1 = insn_data[icode].operand[1].mode;
12546 mode2 = insn_data[icode].operand[2].mode;
12547 mode3 = insn_data[icode].operand[3].mode;
12548 mode4 = insn_data[icode].operand[4].mode;
12549
12550 	  /* The scatter instruction stores operand op3 to memory with
12551 	     indices from op2 and scale from op4 under writemask op1.
12552 	     If index operand op2 has more elements than source operand
12553 	     op3, one needs to use only its low half, and vice versa.  */
12554 switch (fcode)
12555 {
12556 case IX86_BUILTIN_SCATTERALTSIV8DF:
12557 case IX86_BUILTIN_SCATTERALTSIV8DI:
12558 half = gen_reg_rtx (V8SImode);
12559 if (!nonimmediate_operand (op2, V16SImode))
12560 op2 = copy_to_mode_reg (V16SImode, op2);
12561 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12562 op2 = half;
12563 break;
12564 case IX86_BUILTIN_SCATTERALTDIV16SF:
12565 case IX86_BUILTIN_SCATTERALTDIV16SI:
12566 half = gen_reg_rtx (mode3);
12567 if (mode3 == V8SFmode)
12568 gen = gen_vec_extract_lo_v16sf;
12569 else
12570 gen = gen_vec_extract_lo_v16si;
12571 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12572 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12573 emit_insn (gen (half, op3));
12574 op3 = half;
12575 break;
12576 case IX86_BUILTIN_SCATTERALTSIV4DF:
12577 case IX86_BUILTIN_SCATTERALTSIV4DI:
12578 half = gen_reg_rtx (V4SImode);
12579 if (!nonimmediate_operand (op2, V8SImode))
12580 op2 = copy_to_mode_reg (V8SImode, op2);
12581 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12582 op2 = half;
12583 break;
12584 case IX86_BUILTIN_SCATTERALTDIV8SF:
12585 case IX86_BUILTIN_SCATTERALTDIV8SI:
12586 half = gen_reg_rtx (mode3);
12587 if (mode3 == V4SFmode)
12588 gen = gen_vec_extract_lo_v8sf;
12589 else
12590 gen = gen_vec_extract_lo_v8si;
12591 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12592 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12593 emit_insn (gen (half, op3));
12594 op3 = half;
12595 break;
12596 case IX86_BUILTIN_SCATTERALTSIV2DF:
12597 case IX86_BUILTIN_SCATTERALTSIV2DI:
12598 if (!nonimmediate_operand (op2, V4SImode))
12599 op2 = copy_to_mode_reg (V4SImode, op2);
12600 break;
12601 case IX86_BUILTIN_SCATTERALTDIV4SF:
12602 case IX86_BUILTIN_SCATTERALTDIV4SI:
12603 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12604 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12605 break;
12606 default:
12607 break;
12608 }
12609
12610 	  /* Force the memory operand to be addressed through a base register
12611 	     here.  We don't want to do this to the memory operands of other
12612 	     builtin functions.  */
12613 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12614
12615 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12616 op0 = copy_to_mode_reg (Pmode, op0);
12617
12618 op1 = fixup_modeless_constant (op1, mode1);
12619
12620 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12621 {
12622 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12623 op1 = copy_to_mode_reg (mode1, op1);
12624 }
12625 else
12626 {
12627 op1 = copy_to_reg (op1);
12628 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12629 }
12630
12631 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12632 op2 = copy_to_mode_reg (mode2, op2);
12633
12634 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12635 op3 = copy_to_mode_reg (mode3, op3);
12636
12637 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12638 {
12639 error ("the last argument must be scale 1, 2, 4, 8");
12640 return const0_rtx;
12641 }
12642
12643 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12644 if (! pat)
12645 return const0_rtx;
12646
12647 emit_insn (pat);
12648 return 0;
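	  /* Illustrative only -- the user-level shape, assuming the usual
	     AVX-512F scatter wrappers from <immintrin.h>:

	       void scatter16 (float *base, __m512i idx, __m512 v)
	       {
	         _mm512_i32scatter_ps (base, idx, v, 4);  // scale: 1, 2, 4 or 8
	       }

	     Here base, idx, v and the scale correspond to op0, op2, op3 and
	     op4 above, with op1 being the writemask (all ones for the
	     unmasked form).  */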
12649
12650 vec_prefetch_gen:
12651 arg0 = CALL_EXPR_ARG (exp, 0);
12652 arg1 = CALL_EXPR_ARG (exp, 1);
12653 arg2 = CALL_EXPR_ARG (exp, 2);
12654 arg3 = CALL_EXPR_ARG (exp, 3);
12655 arg4 = CALL_EXPR_ARG (exp, 4);
12656 op0 = expand_normal (arg0);
12657 op1 = expand_normal (arg1);
12658 op2 = expand_normal (arg2);
12659 op3 = expand_normal (arg3);
12660 op4 = expand_normal (arg4);
12661 mode0 = insn_data[icode].operand[0].mode;
12662 mode1 = insn_data[icode].operand[1].mode;
12663 mode3 = insn_data[icode].operand[3].mode;
12664 mode4 = insn_data[icode].operand[4].mode;
12665
12666 op0 = fixup_modeless_constant (op0, mode0);
12667
12668 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12669 {
12670 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12671 op0 = copy_to_mode_reg (mode0, op0);
12672 }
12673 else
12674 {
12675 op0 = copy_to_reg (op0);
12676 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12677 }
12678
12679 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12680 op1 = copy_to_mode_reg (mode1, op1);
12681
12682 	  /* Force the memory operand to be addressed through a base register
12683 	     here.  We don't want to do this to the memory operands of other
12684 	     builtin functions.  */
12685 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12686
12687 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12688 op2 = copy_to_mode_reg (Pmode, op2);
12689
12690 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12691 {
12692 	      error ("the fourth argument must be scale 1, 2, 4, 8");
12693 return const0_rtx;
12694 }
12695
12696 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12697 {
12698 error ("incorrect hint operand");
12699 return const0_rtx;
12700 }
12701
12702 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12703 if (! pat)
12704 return const0_rtx;
12705
12706 emit_insn (pat);
12707
12708 return 0;
12709
12710 case IX86_BUILTIN_XABORT:
12711 icode = CODE_FOR_xabort;
12712 arg0 = CALL_EXPR_ARG (exp, 0);
12713 op0 = expand_normal (arg0);
12714 mode0 = insn_data[icode].operand[0].mode;
12715 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12716 {
12717 error ("the argument to %<xabort%> intrinsic must "
12718 "be an 8-bit immediate");
12719 return const0_rtx;
12720 }
12721 emit_insn (gen_xabort (op0));
12722 return 0;
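	  /* Illustrative only: _xabort takes a compile-time 8-bit immediate
	     (hence the predicate check and error above).  The usual RTM
	     pattern, assuming the <immintrin.h> wrappers, is:

	       void run_txn (int bail_out)
	       {
	         unsigned int status = _xbegin ();
	         if (status == _XBEGIN_STARTED)
	           {
	             if (bail_out)
	               _xabort (0xff);  // code is visible in _xbegin's result
	             _xend ();
	           }
	       }
	  */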
12723
12724 case IX86_BUILTIN_RSTORSSP:
12725 case IX86_BUILTIN_CLRSSBSY:
12726 arg0 = CALL_EXPR_ARG (exp, 0);
12727 op0 = expand_normal (arg0);
12728 icode = (fcode == IX86_BUILTIN_RSTORSSP
12729 ? CODE_FOR_rstorssp
12730 : CODE_FOR_clrssbsy);
12731 if (!address_operand (op0, VOIDmode))
12732 {
12733 op1 = convert_memory_address (Pmode, op0);
12734 op0 = copy_addr_to_reg (op1);
12735 }
12736 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12737 return 0;
12738
12739 case IX86_BUILTIN_WRSSD:
12740 case IX86_BUILTIN_WRSSQ:
12741 case IX86_BUILTIN_WRUSSD:
12742 case IX86_BUILTIN_WRUSSQ:
12743 arg0 = CALL_EXPR_ARG (exp, 0);
12744 op0 = expand_normal (arg0);
12745 arg1 = CALL_EXPR_ARG (exp, 1);
12746 op1 = expand_normal (arg1);
12747 switch (fcode)
12748 {
12749 case IX86_BUILTIN_WRSSD:
12750 icode = CODE_FOR_wrsssi;
12751 mode = SImode;
12752 break;
12753 case IX86_BUILTIN_WRSSQ:
12754 icode = CODE_FOR_wrssdi;
12755 mode = DImode;
12756 break;
12757 case IX86_BUILTIN_WRUSSD:
12758 icode = CODE_FOR_wrusssi;
12759 mode = SImode;
12760 break;
12761 case IX86_BUILTIN_WRUSSQ:
12762 icode = CODE_FOR_wrussdi;
12763 mode = DImode;
12764 	    break;
	  default:
	    gcc_unreachable ();
12765 	  }
12766 op0 = force_reg (mode, op0);
12767 if (!address_operand (op1, VOIDmode))
12768 {
12769 op2 = convert_memory_address (Pmode, op1);
12770 op1 = copy_addr_to_reg (op2);
12771 }
12772 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12773 return 0;
12774
12775 default:
12776 break;
12777 }
12778
12779 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12780 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12781 {
12782 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12783 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12784 target);
12785 }
12786
12787 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12788 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12789 {
12790 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12791 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12792 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12793 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12794 int masked = 1;
12795 machine_mode mode, wide_mode, nar_mode;
12796
12797 nar_mode = V4SFmode;
12798 mode = V16SFmode;
12799 wide_mode = V64SFmode;
12800 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12801 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12802
12803 switch (fcode)
12804 {
12805 case IX86_BUILTIN_4FMAPS:
12806 fcn = gen_avx5124fmaddps_4fmaddps;
12807 masked = 0;
12808 goto v4fma_expand;
12809
12810 case IX86_BUILTIN_4DPWSSD:
12811 nar_mode = V4SImode;
12812 mode = V16SImode;
12813 wide_mode = V64SImode;
12814 fcn = gen_avx5124vnniw_vp4dpwssd;
12815 masked = 0;
12816 goto v4fma_expand;
12817
12818 case IX86_BUILTIN_4DPWSSDS:
12819 nar_mode = V4SImode;
12820 mode = V16SImode;
12821 wide_mode = V64SImode;
12822 fcn = gen_avx5124vnniw_vp4dpwssds;
12823 masked = 0;
12824 goto v4fma_expand;
12825
12826 case IX86_BUILTIN_4FNMAPS:
12827 fcn = gen_avx5124fmaddps_4fnmaddps;
12828 masked = 0;
12829 goto v4fma_expand;
12830
12831 case IX86_BUILTIN_4FNMAPS_MASK:
12832 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12833 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12834 goto v4fma_expand;
12835
12836 case IX86_BUILTIN_4DPWSSD_MASK:
12837 nar_mode = V4SImode;
12838 mode = V16SImode;
12839 wide_mode = V64SImode;
12840 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12841 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12842 goto v4fma_expand;
12843
12844 case IX86_BUILTIN_4DPWSSDS_MASK:
12845 nar_mode = V4SImode;
12846 mode = V16SImode;
12847 wide_mode = V64SImode;
12848 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12849 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12850 goto v4fma_expand;
12851
12852 case IX86_BUILTIN_4FMAPS_MASK:
12853 {
12854 tree args[4];
12855 rtx ops[4];
12856 rtx wide_reg;
12857 rtx accum;
12858 rtx addr;
12859 rtx mem;
12860
12861 v4fma_expand:
12862 wide_reg = gen_reg_rtx (wide_mode);
12863 for (i = 0; i < 4; i++)
12864 {
12865 args[i] = CALL_EXPR_ARG (exp, i);
12866 ops[i] = expand_normal (args[i]);
12867
12868 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12869 ops[i]);
12870 }
12871
12872 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12873 accum = force_reg (mode, accum);
12874
12875 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12876 addr = force_reg (Pmode, addr);
12877
12878 mem = gen_rtx_MEM (nar_mode, addr);
12879
12880 target = gen_reg_rtx (mode);
12881
12882 emit_move_insn (target, accum);
12883
12884 if (! masked)
12885 emit_insn (fcn (target, accum, wide_reg, mem));
12886 else
12887 {
12888 rtx merge, mask;
12889 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12890
12891 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12892
12893 if (CONST_INT_P (mask))
12894 mask = fixup_modeless_constant (mask, HImode);
12895
12896 mask = force_reg (HImode, mask);
12897
12898 if (GET_MODE (mask) != HImode)
12899 mask = gen_rtx_SUBREG (HImode, mask, 0);
12900
12901 	      /* If merge is 0 then we're about to emit the z-masked variant.  */
12902 if (const0_operand (merge, mode))
12903 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12904 	      /* If merge is the same as accum then emit the merge-masked variant.  */
12905 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12906 {
12907 merge = force_reg (mode, merge);
12908 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12909 }
12910 	      /* Merging with something unknown can happen if we z-mask with -O0.  */
12911 else
12912 {
12913 target = gen_reg_rtx (mode);
12914 emit_move_insn (target, merge);
12915 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12916 }
12917 }
12918 return target;
12919 }
12920
12921 case IX86_BUILTIN_4FNMASS:
12922 fcn = gen_avx5124fmaddps_4fnmaddss;
12923 masked = 0;
12924 goto s4fma_expand;
12925
12926 case IX86_BUILTIN_4FMASS:
12927 fcn = gen_avx5124fmaddps_4fmaddss;
12928 masked = 0;
12929 goto s4fma_expand;
12930
12931 case IX86_BUILTIN_4FNMASS_MASK:
12932 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
12933 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
12934 goto s4fma_expand;
12935
12936 case IX86_BUILTIN_4FMASS_MASK:
12937 {
12938 tree args[4];
12939 rtx ops[4];
12940 rtx wide_reg;
12941 rtx accum;
12942 rtx addr;
12943 rtx mem;
12944
12945 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
12946 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
12947
12948 s4fma_expand:
12949 mode = V4SFmode;
12950 wide_reg = gen_reg_rtx (V64SFmode);
12951 for (i = 0; i < 4; i++)
12952 {
12953 rtx tmp;
12954 args[i] = CALL_EXPR_ARG (exp, i);
12955 ops[i] = expand_normal (args[i]);
12956
12957 tmp = gen_reg_rtx (SFmode);
12958 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
12959
12960 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
12961 gen_rtx_SUBREG (V16SFmode, tmp, 0));
12962 }
12963
12964 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12965 accum = force_reg (V4SFmode, accum);
12966
12967 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12968 addr = force_reg (Pmode, addr);
12969
12970 mem = gen_rtx_MEM (V4SFmode, addr);
12971
12972 target = gen_reg_rtx (V4SFmode);
12973
12974 emit_move_insn (target, accum);
12975
12976 if (! masked)
12977 emit_insn (fcn (target, accum, wide_reg, mem));
12978 else
12979 {
12980 rtx merge, mask;
12981 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12982
12983 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12984
12985 if (CONST_INT_P (mask))
12986 mask = fixup_modeless_constant (mask, QImode);
12987
12988 mask = force_reg (QImode, mask);
12989
12990 if (GET_MODE (mask) != QImode)
12991 mask = gen_rtx_SUBREG (QImode, mask, 0);
12992
12993 	      /* If merge is 0 then we're about to emit the z-masked variant.  */
12994 if (const0_operand (merge, mode))
12995 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12996 	      /* If merge is the same as accum then emit the merge-masked
12997 		 variant.  */
12998 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12999 {
13000 merge = force_reg (mode, merge);
13001 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13002 }
13003 	      /* Merging with something unknown can happen if we z-mask
13004 		 with -O0.  */
13005 else
13006 {
13007 target = gen_reg_rtx (mode);
13008 emit_move_insn (target, merge);
13009 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13010 }
13011 }
13012 return target;
13013 }
13014 case IX86_BUILTIN_RDPID:
13015 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13016 target);
13017 case IX86_BUILTIN_FABSQ:
13018 case IX86_BUILTIN_COPYSIGNQ:
13019 if (!TARGET_SSE)
13020 /* Emit a normal call if SSE isn't available. */
13021 return expand_call (exp, target, ignore);
13022 /* FALLTHRU */
13023 default:
13024 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13025 }
13026 }
13027
13028 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13029 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13030 {
13031 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13032 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13033 }
13034
13035 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13036 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13037 {
13038 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13039 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13040 }
13041
13042 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13043 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13044 {
13045 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13046 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13047 }
13048
13049 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13050 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13051 {
13052 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13053 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13054 }
13055
13056 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13057 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13058 {
13059 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13060 const struct builtin_description *d = bdesc_multi_arg + i;
13061 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13062 (enum ix86_builtin_func_type)
13063 d->flag, d->comparison);
13064 }
13065
13066 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13067 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13068 {
13069 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13070 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13071 target);
13072 }
13073
13074 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13075 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13076 {
13077 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13078 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13079 target);
13080 }
13081
13082 gcc_unreachable ();
13083 }
13084
13085 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13086 fill target with val via vec_duplicate. */
13087
13088 static bool
13089 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13090 {
13091 bool ok;
13092 rtx_insn *insn;
13093 rtx dup;
13094
13095 /* First attempt to recognize VAL as-is. */
13096 dup = gen_vec_duplicate (mode, val);
13097 insn = emit_insn (gen_rtx_SET (target, dup));
13098 if (recog_memoized (insn) < 0)
13099 {
13100 rtx_insn *seq;
13101 machine_mode innermode = GET_MODE_INNER (mode);
13102 rtx reg;
13103
13104 /* If that fails, force VAL into a register. */
13105
13106 start_sequence ();
13107 reg = force_reg (innermode, val);
13108 if (GET_MODE (reg) != innermode)
13109 reg = gen_lowpart (innermode, reg);
13110 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13111 seq = get_insns ();
13112 end_sequence ();
13113 if (seq)
13114 emit_insn_before (seq, insn);
13115
13116 ok = recog_memoized (insn) >= 0;
13117 gcc_assert (ok);
13118 }
13119 return true;
13120 }
13121
13122 /* Get a vector mode of the same size as the original but with elements
13123 twice as wide. This is only guaranteed to apply to integral vectors. */
13124
13125 static machine_mode
13126 get_mode_wider_vector (machine_mode o)
13127 {
13128 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
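/* E.g. this is expected to map V16QImode to V8HImode and V8HImode to
   V4SImode: same total size, half as many elements, each twice as wide;
   the asserts below check that assumption. */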
13129 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13130 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13131 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13132 return n;
13133 }
13134
13135 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13136 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13137
13138 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13139 with all elements equal to VAL. Return true if successful. */
13140
13141 static bool
13142 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13143 rtx target, rtx val)
13144 {
13145 bool ok;
13146
13147 switch (mode)
13148 {
13149 case E_V2SImode:
13150 case E_V2SFmode:
13151 if (!mmx_ok)
13152 return false;
13153 /* FALLTHRU */
13154
13155 case E_V4DFmode:
13156 case E_V4DImode:
13157 case E_V8SFmode:
13158 case E_V8SImode:
13159 case E_V2DFmode:
13160 case E_V2DImode:
13161 case E_V4SFmode:
13162 case E_V4SImode:
13163 case E_V16SImode:
13164 case E_V8DImode:
13165 case E_V16SFmode:
13166 case E_V8DFmode:
13167 return ix86_vector_duplicate_value (mode, target, val);
13168
13169 case E_V4HImode:
13170 if (!mmx_ok)
13171 return false;
13172 if (TARGET_SSE || TARGET_3DNOW_A)
13173 {
13174 rtx x;
13175
13176 val = gen_lowpart (SImode, val);
13177 x = gen_rtx_TRUNCATE (HImode, val);
13178 x = gen_rtx_VEC_DUPLICATE (mode, x);
13179 emit_insn (gen_rtx_SET (target, x));
13180 return true;
13181 }
13182 goto widen;
13183
13184 case E_V8QImode:
13185 if (!mmx_ok)
13186 return false;
13187 goto widen;
13188
13189 case E_V8HImode:
13190 if (TARGET_AVX2)
13191 return ix86_vector_duplicate_value (mode, target, val);
13192
13193 if (TARGET_SSE2)
13194 {
13195 struct expand_vec_perm_d dperm;
13196 rtx tmp1, tmp2;
13197
13198 permute:
13199 memset (&dperm, 0, sizeof (dperm));
13200 dperm.target = target;
13201 dperm.vmode = mode;
13202 dperm.nelt = GET_MODE_NUNITS (mode);
13203 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13204 dperm.one_operand_p = true;
13205
13206 /* Extend to SImode using a paradoxical SUBREG. */
13207 tmp1 = gen_reg_rtx (SImode);
13208 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13209
13210 /* Insert the SImode value as low element of a V4SImode vector. */
13211 tmp2 = gen_reg_rtx (V4SImode);
13212 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13213 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13214
13215 ok = (expand_vec_perm_1 (&dperm)
13216 || expand_vec_perm_broadcast_1 (&dperm));
13217 gcc_assert (ok);
13218 return ok;
13219 }
13220 goto widen;
13221
13222 case E_V16QImode:
13223 if (TARGET_AVX2)
13224 return ix86_vector_duplicate_value (mode, target, val);
13225
13226 if (TARGET_SSE2)
13227 goto permute;
13228 goto widen;
13229
13230 widen:
13231 /* Replicate the value once into the next wider mode and recurse. */
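/* E.g. to broadcast a QImode value into V8QImode, form the HImode
   value (val | (val << 8)) and broadcast that into V4HImode via the
   recursive call. */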
13232 {
13233 machine_mode smode, wsmode, wvmode;
13234 rtx x;
13235
13236 smode = GET_MODE_INNER (mode);
13237 wvmode = get_mode_wider_vector (mode);
13238 wsmode = GET_MODE_INNER (wvmode);
13239
13240 val = convert_modes (wsmode, smode, val, true);
13241 x = expand_simple_binop (wsmode, ASHIFT, val,
13242 GEN_INT (GET_MODE_BITSIZE (smode)),
13243 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13244 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13245
13246 x = gen_reg_rtx (wvmode);
13247 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13248 gcc_assert (ok);
13249 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13250 return ok;
13251 }
13252
13253 case E_V16HImode:
13254 case E_V32QImode:
13255 if (TARGET_AVX2)
13256 return ix86_vector_duplicate_value (mode, target, val);
13257 else
13258 {
13259 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13260 rtx x = gen_reg_rtx (hvmode);
13261
13262 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13263 gcc_assert (ok);
13264
13265 x = gen_rtx_VEC_CONCAT (mode, x, x);
13266 emit_insn (gen_rtx_SET (target, x));
13267 }
13268 return true;
13269
13270 case E_V64QImode:
13271 case E_V32HImode:
13272 if (TARGET_AVX512BW)
13273 return ix86_vector_duplicate_value (mode, target, val);
13274 else
13275 {
13276 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13277 rtx x = gen_reg_rtx (hvmode);
13278
13279 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13280 gcc_assert (ok);
13281
13282 x = gen_rtx_VEC_CONCAT (mode, x, x);
13283 emit_insn (gen_rtx_SET (target, x));
13284 }
13285 return true;
13286
13287 default:
13288 return false;
13289 }
13290 }
13291
13292 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13293 whose ONE_VAR element is VAR, and other elements are zero. Return true
13294 if successful. */
13295
13296 static bool
13297 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13298 rtx target, rtx var, int one_var)
13299 {
13300 machine_mode vsimode;
13301 rtx new_target;
13302 rtx x, tmp;
13303 bool use_vector_set = false;
13304 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13305
13306 switch (mode)
13307 {
13308 case E_V2DImode:
13309 /* For SSE4.1, we normally use vector set. But if the second
13310 element is zero and inter-unit moves are OK, we use movq
13311 instead. */
13312 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13313 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13314 && one_var == 0));
13315 break;
13316 case E_V16QImode:
13317 case E_V4SImode:
13318 case E_V4SFmode:
13319 use_vector_set = TARGET_SSE4_1;
13320 break;
13321 case E_V8HImode:
13322 use_vector_set = TARGET_SSE2;
13323 break;
13324 case E_V8QImode:
13325 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13326 break;
13327 case E_V4HImode:
13328 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13329 break;
13330 case E_V32QImode:
13331 case E_V16HImode:
13332 use_vector_set = TARGET_AVX;
13333 break;
13334 case E_V8SImode:
13335 use_vector_set = TARGET_AVX;
13336 gen_vec_set_0 = gen_vec_setv8si_0;
13337 break;
13338 case E_V8SFmode:
13339 use_vector_set = TARGET_AVX;
13340 gen_vec_set_0 = gen_vec_setv8sf_0;
13341 break;
13342 case E_V4DFmode:
13343 use_vector_set = TARGET_AVX;
13344 gen_vec_set_0 = gen_vec_setv4df_0;
13345 break;
13346 case E_V4DImode:
13347 /* Use ix86_expand_vector_set in 64bit mode only. */
13348 use_vector_set = TARGET_AVX && TARGET_64BIT;
13349 gen_vec_set_0 = gen_vec_setv4di_0;
13350 break;
13351 case E_V16SImode:
13352 use_vector_set = TARGET_AVX512F && one_var == 0;
13353 gen_vec_set_0 = gen_vec_setv16si_0;
13354 break;
13355 case E_V16SFmode:
13356 use_vector_set = TARGET_AVX512F && one_var == 0;
13357 gen_vec_set_0 = gen_vec_setv16sf_0;
13358 break;
13359 case E_V8DFmode:
13360 use_vector_set = TARGET_AVX512F && one_var == 0;
13361 gen_vec_set_0 = gen_vec_setv8df_0;
13362 break;
13363 case E_V8DImode:
13364 /* Use ix86_expand_vector_set in 64bit mode only. */
13365 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13366 gen_vec_set_0 = gen_vec_setv8di_0;
13367 break;
13368 default:
13369 break;
13370 }
13371
13372 if (use_vector_set)
13373 {
13374 if (gen_vec_set_0 && one_var == 0)
13375 {
13376 var = force_reg (GET_MODE_INNER (mode), var);
13377 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13378 return true;
13379 }
13380 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13381 var = force_reg (GET_MODE_INNER (mode), var);
13382 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13383 return true;
13384 }
13385
13386 switch (mode)
13387 {
13388 case E_V2SFmode:
13389 case E_V2SImode:
13390 if (!mmx_ok)
13391 return false;
13392 /* FALLTHRU */
13393
13394 case E_V2DFmode:
13395 case E_V2DImode:
13396 if (one_var != 0)
13397 return false;
13398 var = force_reg (GET_MODE_INNER (mode), var);
13399 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13400 emit_insn (gen_rtx_SET (target, x));
13401 return true;
13402
13403 case E_V4SFmode:
13404 case E_V4SImode:
13405 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13406 new_target = gen_reg_rtx (mode);
13407 else
13408 new_target = target;
13409 var = force_reg (GET_MODE_INNER (mode), var);
13410 x = gen_rtx_VEC_DUPLICATE (mode, var);
13411 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13412 emit_insn (gen_rtx_SET (new_target, x));
13413 if (one_var != 0)
13414 {
13415 /* We need to shuffle the value to the correct position, so
13416 create a new pseudo to store the intermediate result. */
13417
13418 /* With SSE2, we can use the integer shuffle insns. */
13419 if (mode != V4SFmode && TARGET_SSE2)
13420 {
13421 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13422 const1_rtx,
13423 GEN_INT (one_var == 1 ? 0 : 1),
13424 GEN_INT (one_var == 2 ? 0 : 1),
13425 GEN_INT (one_var == 3 ? 0 : 1)));
13426 if (target != new_target)
13427 emit_move_insn (target, new_target);
13428 return true;
13429 }
13430
13431 /* Otherwise convert the intermediate result to V4SFmode and
13432 use the SSE1 shuffle instructions. */
13433 if (mode != V4SFmode)
13434 {
13435 tmp = gen_reg_rtx (V4SFmode);
13436 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13437 }
13438 else
13439 tmp = new_target;
13440
13441 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13442 const1_rtx,
13443 GEN_INT (one_var == 1 ? 0 : 1),
13444 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13445 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13446
13447 if (mode != V4SFmode)
13448 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13449 else if (tmp != target)
13450 emit_move_insn (target, tmp);
13451 }
13452 else if (target != new_target)
13453 emit_move_insn (target, new_target);
13454 return true;
13455
13456 case E_V8HImode:
13457 case E_V16QImode:
13458 vsimode = V4SImode;
13459 goto widen;
13460 case E_V4HImode:
13461 case E_V8QImode:
13462 if (!mmx_ok)
13463 return false;
13464 vsimode = V2SImode;
13465 goto widen;
13466 widen:
13467 if (one_var != 0)
13468 return false;
13469
13470 /* Zero extend the variable element to SImode and recurse. */
13471 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13472
13473 x = gen_reg_rtx (vsimode);
13474 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13475 var, one_var))
13476 gcc_unreachable ();
13477
13478 emit_move_insn (target, gen_lowpart (mode, x));
13479 return true;
13480
13481 default:
13482 return false;
13483 }
13484 }
13485
13486 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13487 consisting of the values in VALS. It is known that all elements
13488 except ONE_VAR are constants. Return true if successful. */
13489
13490 static bool
13491 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13492 rtx target, rtx vals, int one_var)
13493 {
13494 rtx var = XVECEXP (vals, 0, one_var);
13495 machine_mode wmode;
13496 rtx const_vec, x;
13497
13498 const_vec = copy_rtx (vals);
13499 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13500 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13501
13502 switch (mode)
13503 {
13504 case E_V2DFmode:
13505 case E_V2DImode:
13506 case E_V2SFmode:
13507 case E_V2SImode:
13508 /* For the two element vectors, it's just as easy to use
13509 the general case. */
13510 return false;
13511
13512 case E_V4DImode:
13513 /* Use ix86_expand_vector_set in 64bit mode only. */
13514 if (!TARGET_64BIT)
13515 return false;
13516 /* FALLTHRU */
13517 case E_V4DFmode:
13518 case E_V8SFmode:
13519 case E_V8SImode:
13520 case E_V16HImode:
13521 case E_V32QImode:
13522 case E_V4SFmode:
13523 case E_V4SImode:
13524 case E_V8HImode:
13525 case E_V4HImode:
13526 break;
13527
13528 case E_V16QImode:
13529 if (TARGET_SSE4_1)
13530 break;
13531 wmode = V8HImode;
13532 goto widen;
13533 case E_V8QImode:
13534 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13535 break;
13536 wmode = V4HImode;
13537 goto widen;
13538 widen:
13539 /* There's no way to set one QImode entry easily. Combine
13540 the variable value with its adjacent constant value, and
13541 promote to an HImode set. */
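/* E.g. for V16QImode with the variable element at index 5, the constant
   at index 4 is kept in the low byte, the variable value is shifted into
   the high byte, and the combined HImode value is inserted at index 2 of
   the V8HImode view of the vector. */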
13542 x = XVECEXP (vals, 0, one_var ^ 1);
13543 if (one_var & 1)
13544 {
13545 var = convert_modes (HImode, QImode, var, true);
13546 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13547 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13548 x = GEN_INT (INTVAL (x) & 0xff);
13549 }
13550 else
13551 {
13552 var = convert_modes (HImode, QImode, var, true);
13553 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13554 }
13555 if (x != const0_rtx)
13556 var = expand_simple_binop (HImode, IOR, var, x, var,
13557 1, OPTAB_LIB_WIDEN);
13558
13559 x = gen_reg_rtx (wmode);
13560 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13561 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13562
13563 emit_move_insn (target, gen_lowpart (mode, x));
13564 return true;
13565
13566 default:
13567 return false;
13568 }
13569
13570 emit_move_insn (target, const_vec);
13571 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13572 return true;
13573 }
13574
13575 /* A subroutine of ix86_expand_vector_init_general. Use vector
13576 concatenate to handle the most general case: all values variable,
13577 and none identical. */
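/* E.g. a V8SImode vector built from eight scalar operands is handled as
   two recursive V4SImode builds (each of which concatenates a pair of
   V2SImode halves), followed by a final V8SImode VEC_CONCAT of the two
   halves. */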
13578
13579 static void
13580 ix86_expand_vector_init_concat (machine_mode mode,
13581 rtx target, rtx *ops, int n)
13582 {
13583 machine_mode half_mode = VOIDmode;
13584 rtx half[2];
13585 rtvec v;
13586 int i, j;
13587
13588 switch (n)
13589 {
13590 case 2:
13591 switch (mode)
13592 {
13593 case E_V16SImode:
13594 half_mode = V8SImode;
13595 break;
13596 case E_V16SFmode:
13597 half_mode = V8SFmode;
13598 break;
13599 case E_V8DImode:
13600 half_mode = V4DImode;
13601 break;
13602 case E_V8DFmode:
13603 half_mode = V4DFmode;
13604 break;
13605 case E_V8SImode:
13606 half_mode = V4SImode;
13607 break;
13608 case E_V8SFmode:
13609 half_mode = V4SFmode;
13610 break;
13611 case E_V4DImode:
13612 half_mode = V2DImode;
13613 break;
13614 case E_V4DFmode:
13615 half_mode = V2DFmode;
13616 break;
13617 case E_V4SImode:
13618 half_mode = V2SImode;
13619 break;
13620 case E_V4SFmode:
13621 half_mode = V2SFmode;
13622 break;
13623 case E_V2DImode:
13624 half_mode = DImode;
13625 break;
13626 case E_V2SImode:
13627 half_mode = SImode;
13628 break;
13629 case E_V2DFmode:
13630 half_mode = DFmode;
13631 break;
13632 case E_V2SFmode:
13633 half_mode = SFmode;
13634 break;
13635 default:
13636 gcc_unreachable ();
13637 }
13638
13639 if (!register_operand (ops[1], half_mode))
13640 ops[1] = force_reg (half_mode, ops[1]);
13641 if (!register_operand (ops[0], half_mode))
13642 ops[0] = force_reg (half_mode, ops[0]);
13643 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13644 ops[1])));
13645 break;
13646
13647 case 4:
13648 switch (mode)
13649 {
13650 case E_V4DImode:
13651 half_mode = V2DImode;
13652 break;
13653 case E_V4DFmode:
13654 half_mode = V2DFmode;
13655 break;
13656 case E_V4SImode:
13657 half_mode = V2SImode;
13658 break;
13659 case E_V4SFmode:
13660 half_mode = V2SFmode;
13661 break;
13662 default:
13663 gcc_unreachable ();
13664 }
13665 goto half;
13666
13667 case 8:
13668 switch (mode)
13669 {
13670 case E_V8DImode:
13671 half_mode = V4DImode;
13672 break;
13673 case E_V8DFmode:
13674 half_mode = V4DFmode;
13675 break;
13676 case E_V8SImode:
13677 half_mode = V4SImode;
13678 break;
13679 case E_V8SFmode:
13680 half_mode = V4SFmode;
13681 break;
13682 default:
13683 gcc_unreachable ();
13684 }
13685 goto half;
13686
13687 case 16:
13688 switch (mode)
13689 {
13690 case E_V16SImode:
13691 half_mode = V8SImode;
13692 break;
13693 case E_V16SFmode:
13694 half_mode = V8SFmode;
13695 break;
13696 default:
13697 gcc_unreachable ();
13698 }
13699 goto half;
13700
13701 half:
13702 /* FIXME: We process inputs backward to help RA. PR 36222. */
13703 i = n - 1;
13704 for (j = 1; j != -1; j--)
13705 {
13706 half[j] = gen_reg_rtx (half_mode);
13707 switch (n >> 1)
13708 {
13709 case 2:
13710 v = gen_rtvec (2, ops[i-1], ops[i]);
13711 i -= 2;
13712 break;
13713 case 4:
13714 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
13715 i -= 4;
13716 break;
13717 case 8:
13718 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
13719 ops[i-3], ops[i-2], ops[i-1], ops[i]);
13720 i -= 8;
13721 break;
13722 default:
13723 gcc_unreachable ();
13724 }
13725 ix86_expand_vector_init (false, half[j],
13726 gen_rtx_PARALLEL (half_mode, v));
13727 }
13728
13729 ix86_expand_vector_init_concat (mode, target, half, 2);
13730 break;
13731
13732 default:
13733 gcc_unreachable ();
13734 }
13735 }
13736
13737 /* A subroutine of ix86_expand_vector_init_general. Use vector
13738 interleave to handle the most general case: all values variable,
13739 and none identical. */
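/* E.g. for V8HImode each of the four element pairs is placed in the low
   32 bits of a V4SImode register, those registers are merged pairwise
   with interleave-low on V4SImode (punpckldq), and the two results are
   merged with interleave-low on V2DImode (punpcklqdq). */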
13740
13741 static void
13742 ix86_expand_vector_init_interleave (machine_mode mode,
13743 rtx target, rtx *ops, int n)
13744 {
13745 machine_mode first_imode, second_imode, third_imode, inner_mode;
13746 int i, j;
13747 rtx op0, op1;
13748 rtx (*gen_load_even) (rtx, rtx, rtx);
13749 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13750 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13751
13752 switch (mode)
13753 {
13754 case E_V8HImode:
13755 gen_load_even = gen_vec_setv8hi;
13756 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13757 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13758 inner_mode = HImode;
13759 first_imode = V4SImode;
13760 second_imode = V2DImode;
13761 third_imode = VOIDmode;
13762 break;
13763 case E_V16QImode:
13764 gen_load_even = gen_vec_setv16qi;
13765 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13766 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13767 inner_mode = QImode;
13768 first_imode = V8HImode;
13769 second_imode = V4SImode;
13770 third_imode = V2DImode;
13771 break;
13772 default:
13773 gcc_unreachable ();
13774 }
13775
13776 for (i = 0; i < n; i++)
13777 {
13778 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13779 op0 = gen_reg_rtx (SImode);
13780 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13781
13782 /* Insert the SImode value as low element of V4SImode vector. */
13783 op1 = gen_reg_rtx (V4SImode);
13784 op0 = gen_rtx_VEC_MERGE (V4SImode,
13785 gen_rtx_VEC_DUPLICATE (V4SImode,
13786 op0),
13787 CONST0_RTX (V4SImode),
13788 const1_rtx);
13789 emit_insn (gen_rtx_SET (op1, op0));
13790
13791 /* Cast the V4SImode vector back to a vector in the original mode. */
13792 op0 = gen_reg_rtx (mode);
13793 emit_move_insn (op0, gen_lowpart (mode, op1));
13794
13795 /* Load even elements into the second position. */
13796 emit_insn (gen_load_even (op0,
13797 force_reg (inner_mode,
13798 ops [i + i + 1]),
13799 const1_rtx));
13800
13801 /* Cast vector to FIRST_IMODE vector. */
13802 ops[i] = gen_reg_rtx (first_imode);
13803 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13804 }
13805
13806 /* Interleave low FIRST_IMODE vectors. */
13807 for (i = j = 0; i < n; i += 2, j++)
13808 {
13809 op0 = gen_reg_rtx (first_imode);
13810 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13811
13812 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13813 ops[j] = gen_reg_rtx (second_imode);
13814 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13815 }
13816
13817 /* Interleave low SECOND_IMODE vectors. */
13818 switch (second_imode)
13819 {
13820 case E_V4SImode:
13821 for (i = j = 0; i < n / 2; i += 2, j++)
13822 {
13823 op0 = gen_reg_rtx (second_imode);
13824 emit_insn (gen_interleave_second_low (op0, ops[i],
13825 ops[i + 1]));
13826
13827 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13828 vector. */
13829 ops[j] = gen_reg_rtx (third_imode);
13830 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13831 }
13832 second_imode = V2DImode;
13833 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13834 /* FALLTHRU */
13835
13836 case E_V2DImode:
13837 op0 = gen_reg_rtx (second_imode);
13838 emit_insn (gen_interleave_second_low (op0, ops[0],
13839 ops[1]));
13840
13841 /* Cast the SECOND_IMODE vector back to a vector in the original
13842 mode. */
13843 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13844 break;
13845
13846 default:
13847 gcc_unreachable ();
13848 }
13849 }
13850
13851 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13852 all values variable, and none identical. */
13853
13854 static void
13855 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13856 rtx target, rtx vals)
13857 {
13858 rtx ops[64], op0, op1, op2, op3, op4, op5;
13859 machine_mode half_mode = VOIDmode;
13860 machine_mode quarter_mode = VOIDmode;
13861 int n, i;
13862
13863 switch (mode)
13864 {
13865 case E_V2SFmode:
13866 case E_V2SImode:
13867 if (!mmx_ok && !TARGET_SSE)
13868 break;
13869 /* FALLTHRU */
13870
13871 case E_V16SImode:
13872 case E_V16SFmode:
13873 case E_V8DFmode:
13874 case E_V8DImode:
13875 case E_V8SFmode:
13876 case E_V8SImode:
13877 case E_V4DFmode:
13878 case E_V4DImode:
13879 case E_V4SFmode:
13880 case E_V4SImode:
13881 case E_V2DFmode:
13882 case E_V2DImode:
13883 n = GET_MODE_NUNITS (mode);
13884 for (i = 0; i < n; i++)
13885 ops[i] = XVECEXP (vals, 0, i);
13886 ix86_expand_vector_init_concat (mode, target, ops, n);
13887 return;
13888
13889 case E_V2TImode:
13890 for (i = 0; i < 2; i++)
13891 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13892 op0 = gen_reg_rtx (V4DImode);
13893 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13894 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13895 return;
13896
13897 case E_V4TImode:
13898 for (i = 0; i < 4; i++)
13899 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13900 ops[4] = gen_reg_rtx (V4DImode);
13901 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13902 ops[5] = gen_reg_rtx (V4DImode);
13903 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13904 op0 = gen_reg_rtx (V8DImode);
13905 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13906 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13907 return;
13908
13909 case E_V32QImode:
13910 half_mode = V16QImode;
13911 goto half;
13912
13913 case E_V16HImode:
13914 half_mode = V8HImode;
13915 goto half;
13916
13917 half:
13918 n = GET_MODE_NUNITS (mode);
13919 for (i = 0; i < n; i++)
13920 ops[i] = XVECEXP (vals, 0, i);
13921 op0 = gen_reg_rtx (half_mode);
13922 op1 = gen_reg_rtx (half_mode);
13923 ix86_expand_vector_init_interleave (half_mode, op0, ops,
13924 n >> 2);
13925 ix86_expand_vector_init_interleave (half_mode, op1,
13926 &ops [n >> 1], n >> 2);
13927 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
13928 return;
13929
13930 case E_V64QImode:
13931 quarter_mode = V16QImode;
13932 half_mode = V32QImode;
13933 goto quarter;
13934
13935 case E_V32HImode:
13936 quarter_mode = V8HImode;
13937 half_mode = V16HImode;
13938 goto quarter;
13939
13940 quarter:
13941 n = GET_MODE_NUNITS (mode);
13942 for (i = 0; i < n; i++)
13943 ops[i] = XVECEXP (vals, 0, i);
13944 op0 = gen_reg_rtx (quarter_mode);
13945 op1 = gen_reg_rtx (quarter_mode);
13946 op2 = gen_reg_rtx (quarter_mode);
13947 op3 = gen_reg_rtx (quarter_mode);
13948 op4 = gen_reg_rtx (half_mode);
13949 op5 = gen_reg_rtx (half_mode);
13950 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
13951 n >> 3);
13952 ix86_expand_vector_init_interleave (quarter_mode, op1,
13953 &ops [n >> 2], n >> 3);
13954 ix86_expand_vector_init_interleave (quarter_mode, op2,
13955 &ops [n >> 1], n >> 3);
13956 ix86_expand_vector_init_interleave (quarter_mode, op3,
13957 &ops [(n >> 1) | (n >> 2)], n >> 3);
13958 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
13959 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
13960 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
13961 return;
13962
13963 case E_V16QImode:
13964 if (!TARGET_SSE4_1)
13965 break;
13966 /* FALLTHRU */
13967
13968 case E_V8HImode:
13969 if (!TARGET_SSE2)
13970 break;
13971
13972 /* Don't use ix86_expand_vector_init_interleave if we can't
13973 move from GPR to SSE register directly. */
13974 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
13975 break;
13976
13977 n = GET_MODE_NUNITS (mode);
13978 for (i = 0; i < n; i++)
13979 ops[i] = XVECEXP (vals, 0, i);
13980 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
13981 return;
13982
13983 case E_V4HImode:
13984 case E_V8QImode:
13985 break;
13986
13987 default:
13988 gcc_unreachable ();
13989 }
13990
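/* Fall back for the cases that break out of the switch above: pack the
   elements into word-sized integers with shift/IOR and build the vector
   from one, two or four such words. */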
13991 {
13992 int i, j, n_elts, n_words, n_elt_per_word;
13993 machine_mode inner_mode;
13994 rtx words[4], shift;
13995
13996 inner_mode = GET_MODE_INNER (mode);
13997 n_elts = GET_MODE_NUNITS (mode);
13998 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
13999 n_elt_per_word = n_elts / n_words;
14000 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14001
14002 for (i = 0; i < n_words; ++i)
14003 {
14004 rtx word = NULL_RTX;
14005
14006 for (j = 0; j < n_elt_per_word; ++j)
14007 {
14008 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14009 elt = convert_modes (word_mode, inner_mode, elt, true);
14010
14011 if (j == 0)
14012 word = elt;
14013 else
14014 {
14015 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14016 word, 1, OPTAB_LIB_WIDEN);
14017 word = expand_simple_binop (word_mode, IOR, word, elt,
14018 word, 1, OPTAB_LIB_WIDEN);
14019 }
14020 }
14021
14022 words[i] = word;
14023 }
14024
14025 if (n_words == 1)
14026 emit_move_insn (target, gen_lowpart (mode, words[0]));
14027 else if (n_words == 2)
14028 {
14029 rtx tmp = gen_reg_rtx (mode);
14030 emit_clobber (tmp);
14031 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14032 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14033 emit_move_insn (target, tmp);
14034 }
14035 else if (n_words == 4)
14036 {
14037 rtx tmp = gen_reg_rtx (V4SImode);
14038 gcc_assert (word_mode == SImode);
14039 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14040 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14041 emit_move_insn (target, gen_lowpart (mode, tmp));
14042 }
14043 else
14044 gcc_unreachable ();
14045 }
14046 }
14047
14048 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14049 instructions unless MMX_OK is true. */
14050
14051 void
14052 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14053 {
14054 machine_mode mode = GET_MODE (target);
14055 machine_mode inner_mode = GET_MODE_INNER (mode);
14056 int n_elts = GET_MODE_NUNITS (mode);
14057 int n_var = 0, one_var = -1;
14058 bool all_same = true, all_const_zero = true;
14059 int i;
14060 rtx x;
14061
14062 /* First, handle initialization from vector elements (the initializer
entries are themselves vectors). */
14063 if (n_elts != XVECLEN (vals, 0))
14064 {
14065 rtx subtarget = target;
14066 x = XVECEXP (vals, 0, 0);
14067 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14068 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14069 {
14070 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14071 if (inner_mode == QImode || inner_mode == HImode)
14072 {
14073 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14074 mode = mode_for_vector (SImode, n_bits / 4).require ();
14075 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14076 ops[0] = gen_lowpart (inner_mode, ops[0]);
14077 ops[1] = gen_lowpart (inner_mode, ops[1]);
14078 subtarget = gen_reg_rtx (mode);
14079 }
14080 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14081 if (subtarget != target)
14082 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14083 return;
14084 }
14085 gcc_unreachable ();
14086 }
14087
14088 for (i = 0; i < n_elts; ++i)
14089 {
14090 x = XVECEXP (vals, 0, i);
14091 if (!(CONST_SCALAR_INT_P (x)
14092 || CONST_DOUBLE_P (x)
14093 || CONST_FIXED_P (x)))
14094 n_var++, one_var = i;
14095 else if (x != CONST0_RTX (inner_mode))
14096 all_const_zero = false;
14097 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14098 all_same = false;
14099 }
14100
14101 /* Constants are best loaded from the constant pool. */
14102 if (n_var == 0)
14103 {
14104 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14105 return;
14106 }
14107
14108 /* If all values are identical, broadcast the value. */
14109 if (all_same
14110 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14111 XVECEXP (vals, 0, 0)))
14112 return;
14113
14114 /* Values where only one field is non-constant are best loaded from
14115 the pool and overwritten via move later. */
14116 if (n_var == 1)
14117 {
14118 if (all_const_zero
14119 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14120 XVECEXP (vals, 0, one_var),
14121 one_var))
14122 return;
14123
14124 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14125 return;
14126 }
14127
14128 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14129 }
14130
14131 void
14132 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14133 {
14134 machine_mode mode = GET_MODE (target);
14135 machine_mode inner_mode = GET_MODE_INNER (mode);
14136 machine_mode half_mode;
14137 bool use_vec_merge = false;
14138 rtx tmp;
14139 static rtx (*gen_extract[6][2]) (rtx, rtx)
14140 = {
14141 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14142 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14143 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14144 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14145 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14146 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14147 };
14148 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14149 = {
14150 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14151 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14152 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14153 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14154 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14155 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14156 };
14157 int i, j, n;
14158 machine_mode mmode = VOIDmode;
14159 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14160
14161 switch (mode)
14162 {
14163 case E_V2SImode:
14164 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14165 if (use_vec_merge)
14166 break;
14167 /* FALLTHRU */
14168
14169 case E_V2SFmode:
14170 if (mmx_ok)
14171 {
14172 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14173 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14174 if (elt == 0)
14175 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14176 else
14177 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14178 emit_insn (gen_rtx_SET (target, tmp));
14179 return;
14180 }
14181 break;
14182
14183 case E_V2DImode:
14184 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14185 if (use_vec_merge)
14186 break;
14187
14188 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14189 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14190 if (elt == 0)
14191 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14192 else
14193 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14194 emit_insn (gen_rtx_SET (target, tmp));
14195 return;
14196
14197 case E_V2DFmode:
14198 /* NB: For ELT == 0, use standard scalar operation patterns which
14199 preserve the rest of the vector for the combiner:
14200
14201 (vec_merge:V2DF
14202 (vec_duplicate:V2DF (reg:DF))
14203 (reg:V2DF)
14204 (const_int 1))
14205 */
14206 if (elt == 0)
14207 goto do_vec_merge;
14208
14209 {
14210 rtx op0, op1;
14211
14212 /* For the two element vectors, we implement a VEC_CONCAT with
14213 the extraction of the other element. */
14214
14215 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14216 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14217
14218 if (elt == 0)
14219 op0 = val, op1 = tmp;
14220 else
14221 op0 = tmp, op1 = val;
14222
14223 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14224 emit_insn (gen_rtx_SET (target, tmp));
14225 }
14226 return;
14227
14228 case E_V4SFmode:
14229 use_vec_merge = TARGET_SSE4_1;
14230 if (use_vec_merge)
14231 break;
14232
14233 switch (elt)
14234 {
14235 case 0:
14236 use_vec_merge = true;
14237 break;
14238
14239 case 1:
14240 /* tmp = target = A B C D */
14241 tmp = copy_to_reg (target);
14242 /* target = A A B B */
14243 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14244 /* target = X A B B */
14245 ix86_expand_vector_set (false, target, val, 0);
14246 /* target = A X C D */
14247 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14248 const1_rtx, const0_rtx,
14249 GEN_INT (2+4), GEN_INT (3+4)));
14250 return;
14251
14252 case 2:
14253 /* tmp = target = A B C D */
14254 tmp = copy_to_reg (target);
14255 /* tmp = X B C D */
14256 ix86_expand_vector_set (false, tmp, val, 0);
14257 /* target = A B X D */
14258 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14259 const0_rtx, const1_rtx,
14260 GEN_INT (0+4), GEN_INT (3+4)));
14261 return;
14262
14263 case 3:
14264 /* tmp = target = A B C D */
14265 tmp = copy_to_reg (target);
14266 /* tmp = X B C D */
14267 ix86_expand_vector_set (false, tmp, val, 0);
14268 /* target = A B C X */
14269 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14270 const0_rtx, const1_rtx,
14271 GEN_INT (2+4), GEN_INT (0+4)));
14272 return;
14273
14274 default:
14275 gcc_unreachable ();
14276 }
14277 break;
14278
14279 case E_V4SImode:
14280 use_vec_merge = TARGET_SSE4_1;
14281 if (use_vec_merge)
14282 break;
14283
14284 /* Element 0 handled by vec_merge below. */
14285 if (elt == 0)
14286 {
14287 use_vec_merge = true;
14288 break;
14289 }
14290
14291 if (TARGET_SSE2)
14292 {
14293 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14294 store into element 0, then shuffle them back. */
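/* E.g. for ELT == 2 the order becomes { 2, 1, 0, 3 }: the first pshufd
   swaps elements 0 and 2, the value is stored into element 0, and the
   same pshufd swaps them back. */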
14295
14296 rtx order[4];
14297
14298 order[0] = GEN_INT (elt);
14299 order[1] = const1_rtx;
14300 order[2] = const2_rtx;
14301 order[3] = GEN_INT (3);
14302 order[elt] = const0_rtx;
14303
14304 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14305 order[1], order[2], order[3]));
14306
14307 ix86_expand_vector_set (false, target, val, 0);
14308
14309 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14310 order[1], order[2], order[3]));
14311 }
14312 else
14313 {
14314 /* For SSE1, we have to reuse the V4SF code. */
14315 rtx t = gen_reg_rtx (V4SFmode);
14316 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14317 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14318 emit_move_insn (target, gen_lowpart (mode, t));
14319 }
14320 return;
14321
14322 case E_V8HImode:
14323 use_vec_merge = TARGET_SSE2;
14324 break;
14325 case E_V4HImode:
14326 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14327 break;
14328
14329 case E_V16QImode:
14330 use_vec_merge = TARGET_SSE4_1;
14331 break;
14332
14333 case E_V8QImode:
14334 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14335 break;
14336
14337 case E_V32QImode:
14338 half_mode = V16QImode;
14339 j = 0;
14340 n = 16;
14341 goto half;
14342
14343 case E_V16HImode:
14344 half_mode = V8HImode;
14345 j = 1;
14346 n = 8;
14347 goto half;
14348
14349 case E_V8SImode:
14350 half_mode = V4SImode;
14351 j = 2;
14352 n = 4;
14353 goto half;
14354
14355 case E_V4DImode:
14356 half_mode = V2DImode;
14357 j = 3;
14358 n = 2;
14359 goto half;
14360
14361 case E_V8SFmode:
14362 half_mode = V4SFmode;
14363 j = 4;
14364 n = 4;
14365 goto half;
14366
14367 case E_V4DFmode:
14368 half_mode = V2DFmode;
14369 j = 5;
14370 n = 2;
14371 goto half;
14372
14373 half:
14374 /* Compute offset. */
14375 i = elt / n;
14376 elt %= n;
14377
14378 gcc_assert (i <= 1);
14379
14380 /* Extract the half. */
14381 tmp = gen_reg_rtx (half_mode);
14382 emit_insn (gen_extract[j][i] (tmp, target));
14383
14384 /* Put val in tmp at elt. */
14385 ix86_expand_vector_set (false, tmp, val, elt);
14386
14387 /* Put it back. */
14388 emit_insn (gen_insert[j][i] (target, target, tmp));
14389 return;
14390
14391 case E_V8DFmode:
14392 if (TARGET_AVX512F)
14393 {
14394 mmode = QImode;
14395 gen_blendm = gen_avx512f_blendmv8df;
14396 }
14397 break;
14398
14399 case E_V8DImode:
14400 if (TARGET_AVX512F)
14401 {
14402 mmode = QImode;
14403 gen_blendm = gen_avx512f_blendmv8di;
14404 }
14405 break;
14406
14407 case E_V16SFmode:
14408 if (TARGET_AVX512F)
14409 {
14410 mmode = HImode;
14411 gen_blendm = gen_avx512f_blendmv16sf;
14412 }
14413 break;
14414
14415 case E_V16SImode:
14416 if (TARGET_AVX512F)
14417 {
14418 mmode = HImode;
14419 gen_blendm = gen_avx512f_blendmv16si;
14420 }
14421 break;
14422
14423 case E_V32HImode:
14424 if (TARGET_AVX512BW)
14425 {
14426 mmode = SImode;
14427 gen_blendm = gen_avx512bw_blendmv32hi;
14428 }
14429 else if (TARGET_AVX512F)
14430 {
14431 half_mode = E_V8HImode;
14432 n = 8;
14433 goto quarter;
14434 }
14435 break;
14436
14437 case E_V64QImode:
14438 if (TARGET_AVX512BW)
14439 {
14440 mmode = DImode;
14441 gen_blendm = gen_avx512bw_blendmv64qi;
14442 }
14443 else if (TARGET_AVX512F)
14444 {
14445 half_mode = E_V16QImode;
14446 n = 16;
14447 goto quarter;
14448 }
14449 break;
14450
14451 quarter:
14452 /* Compute offset. */
14453 i = elt / n;
14454 elt %= n;
14455
14456 gcc_assert (i <= 3);
14457
14458 {
14459 /* Extract the quarter. */
14460 tmp = gen_reg_rtx (V4SImode);
14461 rtx tmp2 = gen_lowpart (V16SImode, target);
14462 rtx mask = gen_reg_rtx (QImode);
14463
14464 emit_move_insn (mask, constm1_rtx);
14465 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14466 tmp, mask));
14467
14468 tmp2 = gen_reg_rtx (half_mode);
14469 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14470 tmp = tmp2;
14471
14472 /* Put val in tmp at elt. */
14473 ix86_expand_vector_set (false, tmp, val, elt);
14474
14475 /* Put it back. */
14476 tmp2 = gen_reg_rtx (V16SImode);
14477 rtx tmp3 = gen_lowpart (V16SImode, target);
14478 mask = gen_reg_rtx (HImode);
14479 emit_move_insn (mask, constm1_rtx);
14480 tmp = gen_lowpart (V4SImode, tmp);
14481 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14482 tmp3, mask));
14483 emit_move_insn (target, gen_lowpart (mode, tmp2));
14484 }
14485 return;
14486
14487 default:
14488 break;
14489 }
14490
14491 if (mmode != VOIDmode)
14492 {
14493 tmp = gen_reg_rtx (mode);
14494 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14495 /* The avx512*_blendm<mode> expanders have different operand order
14496 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14497 elements where the mask is set and the second input operand otherwise;
14498 in {sse,avx}*_*blend* the first input operand is used for elements
14499 where the mask is clear and the second input operand otherwise. */
14500 emit_insn (gen_blendm (target, target, tmp,
14501 force_reg (mmode,
14502 gen_int_mode (HOST_WIDE_INT_1U << elt,
14503 mmode))));
14504 }
14505 else if (use_vec_merge)
14506 {
14507 do_vec_merge:
14508 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14509 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14510 GEN_INT (HOST_WIDE_INT_1U << elt));
14511 emit_insn (gen_rtx_SET (target, tmp));
14512 }
14513 else
14514 {
14515 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14516
14517 emit_move_insn (mem, target);
14518
14519 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14520 emit_move_insn (tmp, val);
14521
14522 emit_move_insn (target, mem);
14523 }
14524 }
14525
14526 void
14527 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14528 {
14529 machine_mode mode = GET_MODE (vec);
14530 machine_mode inner_mode = GET_MODE_INNER (mode);
14531 bool use_vec_extr = false;
14532 rtx tmp;
14533
14534 switch (mode)
14535 {
14536 case E_V2SImode:
14537 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14538 if (use_vec_extr)
14539 break;
14540 /* FALLTHRU */
14541
14542 case E_V2SFmode:
14543 if (!mmx_ok)
14544 break;
14545 /* FALLTHRU */
14546
14547 case E_V2DFmode:
14548 case E_V2DImode:
14549 case E_V2TImode:
14550 case E_V4TImode:
14551 use_vec_extr = true;
14552 break;
14553
14554 case E_V4SFmode:
14555 use_vec_extr = TARGET_SSE4_1;
14556 if (use_vec_extr)
14557 break;
14558
14559 switch (elt)
14560 {
14561 case 0:
14562 tmp = vec;
14563 break;
14564
14565 case 1:
14566 case 3:
14567 tmp = gen_reg_rtx (mode);
14568 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14569 GEN_INT (elt), GEN_INT (elt),
14570 GEN_INT (elt+4), GEN_INT (elt+4)));
14571 break;
14572
14573 case 2:
14574 tmp = gen_reg_rtx (mode);
14575 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14576 break;
14577
14578 default:
14579 gcc_unreachable ();
14580 }
14581 vec = tmp;
14582 use_vec_extr = true;
14583 elt = 0;
14584 break;
14585
14586 case E_V4SImode:
14587 use_vec_extr = TARGET_SSE4_1;
14588 if (use_vec_extr)
14589 break;
14590
14591 if (TARGET_SSE2)
14592 {
14593 switch (elt)
14594 {
14595 case 0:
14596 tmp = vec;
14597 break;
14598
14599 case 1:
14600 case 3:
14601 tmp = gen_reg_rtx (mode);
14602 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14603 GEN_INT (elt), GEN_INT (elt),
14604 GEN_INT (elt), GEN_INT (elt)));
14605 break;
14606
14607 case 2:
14608 tmp = gen_reg_rtx (mode);
14609 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14610 break;
14611
14612 default:
14613 gcc_unreachable ();
14614 }
14615 vec = tmp;
14616 use_vec_extr = true;
14617 elt = 0;
14618 }
14619 else
14620 {
14621 /* For SSE1, we have to reuse the V4SF code. */
14622 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14623 gen_lowpart (V4SFmode, vec), elt);
14624 return;
14625 }
14626 break;
14627
14628 case E_V8HImode:
14629 use_vec_extr = TARGET_SSE2;
14630 break;
14631 case E_V4HImode:
14632 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14633 break;
14634
14635 case E_V16QImode:
14636 use_vec_extr = TARGET_SSE4_1;
14637 if (!use_vec_extr
14638 && TARGET_SSE2
14639 && elt == 0
14640 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14641 {
14642 tmp = gen_reg_rtx (SImode);
14643 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14644 0);
14645 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14646 return;
14647 }
14648 break;
14649
14650 case E_V8SFmode:
14651 if (TARGET_AVX)
14652 {
14653 tmp = gen_reg_rtx (V4SFmode);
14654 if (elt < 4)
14655 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14656 else
14657 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14658 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14659 return;
14660 }
14661 break;
14662
14663 case E_V4DFmode:
14664 if (TARGET_AVX)
14665 {
14666 tmp = gen_reg_rtx (V2DFmode);
14667 if (elt < 2)
14668 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14669 else
14670 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14671 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14672 return;
14673 }
14674 break;
14675
14676 case E_V32QImode:
14677 if (TARGET_AVX)
14678 {
14679 tmp = gen_reg_rtx (V16QImode);
14680 if (elt < 16)
14681 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14682 else
14683 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14684 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14685 return;
14686 }
14687 break;
14688
14689 case E_V16HImode:
14690 if (TARGET_AVX)
14691 {
14692 tmp = gen_reg_rtx (V8HImode);
14693 if (elt < 8)
14694 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14695 else
14696 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14697 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14698 return;
14699 }
14700 break;
14701
14702 case E_V8SImode:
14703 if (TARGET_AVX)
14704 {
14705 tmp = gen_reg_rtx (V4SImode);
14706 if (elt < 4)
14707 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14708 else
14709 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14710 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14711 return;
14712 }
14713 break;
14714
14715 case E_V4DImode:
14716 if (TARGET_AVX)
14717 {
14718 tmp = gen_reg_rtx (V2DImode);
14719 if (elt < 2)
14720 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14721 else
14722 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14723 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14724 return;
14725 }
14726 break;
14727
14728 case E_V32HImode:
14729 if (TARGET_AVX512BW)
14730 {
14731 tmp = gen_reg_rtx (V16HImode);
14732 if (elt < 16)
14733 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14734 else
14735 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14736 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14737 return;
14738 }
14739 break;
14740
14741 case E_V64QImode:
14742 if (TARGET_AVX512BW)
14743 {
14744 tmp = gen_reg_rtx (V32QImode);
14745 if (elt < 32)
14746 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14747 else
14748 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14749 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14750 return;
14751 }
14752 break;
14753
14754 case E_V16SFmode:
14755 tmp = gen_reg_rtx (V8SFmode);
14756 if (elt < 8)
14757 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14758 else
14759 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14760 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14761 return;
14762
14763 case E_V8DFmode:
14764 tmp = gen_reg_rtx (V4DFmode);
14765 if (elt < 4)
14766 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14767 else
14768 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14769 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14770 return;
14771
14772 case E_V16SImode:
14773 tmp = gen_reg_rtx (V8SImode);
14774 if (elt < 8)
14775 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14776 else
14777 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14778 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14779 return;
14780
14781 case E_V8DImode:
14782 tmp = gen_reg_rtx (V4DImode);
14783 if (elt < 4)
14784 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14785 else
14786 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14787 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14788 return;
14789
14790 case E_V8QImode:
14791 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14792 /* ??? Could extract the appropriate HImode element and shift. */
14793 break;
14794
14795 default:
14796 break;
14797 }
14798
14799 if (use_vec_extr)
14800 {
14801 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14802 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14803
14804 /* Let the rtl optimizers know about the zero extension performed. */
14805 if (inner_mode == QImode || inner_mode == HImode)
14806 {
14807 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14808 target = gen_lowpart (SImode, target);
14809 }
14810
14811 emit_insn (gen_rtx_SET (target, tmp));
14812 }
14813 else
14814 {
14815 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14816
14817 emit_move_insn (mem, vec);
14818
14819 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14820 emit_move_insn (target, tmp);
14821 }
14822 }
14823
14824 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14825 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14826 The upper bits of DEST are undefined, though they shouldn't cause
14827 exceptions (some bits from src or all zeros are ok). */
14828
14829 static void
14830 emit_reduc_half (rtx dest, rtx src, int i)
14831 {
14832 rtx tem, d = dest;
14833 switch (GET_MODE (src))
14834 {
14835 case E_V4SFmode:
14836 if (i == 128)
14837 tem = gen_sse_movhlps (dest, src, src);
14838 else
14839 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14840 GEN_INT (1 + 4), GEN_INT (1 + 4));
14841 break;
14842 case E_V2DFmode:
14843 tem = gen_vec_interleave_highv2df (dest, src, src);
14844 break;
14845 case E_V16QImode:
14846 case E_V8HImode:
14847 case E_V4SImode:
14848 case E_V2DImode:
14849 d = gen_reg_rtx (V1TImode);
14850 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14851 GEN_INT (i / 2));
14852 break;
14853 case E_V8SFmode:
14854 if (i == 256)
14855 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14856 else
14857 tem = gen_avx_shufps256 (dest, src, src,
14858 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14859 break;
14860 case E_V4DFmode:
14861 if (i == 256)
14862 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14863 else
14864 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14865 break;
14866 case E_V32QImode:
14867 case E_V16HImode:
14868 case E_V8SImode:
14869 case E_V4DImode:
14870 if (i == 256)
14871 {
14872 if (GET_MODE (dest) != V4DImode)
14873 d = gen_reg_rtx (V4DImode);
14874 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14875 gen_lowpart (V4DImode, src),
14876 const1_rtx);
14877 }
14878 else
14879 {
14880 d = gen_reg_rtx (V2TImode);
14881 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14882 GEN_INT (i / 2));
14883 }
14884 break;
14885 case E_V64QImode:
14886 case E_V32HImode:
14887 if (i < 64)
14888 {
14889 d = gen_reg_rtx (V4TImode);
14890 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
14891 GEN_INT (i / 2));
14892 break;
14893 }
14894 /* FALLTHRU */
14895 case E_V16SImode:
14896 case E_V16SFmode:
14897 case E_V8DImode:
14898 case E_V8DFmode:
14899 if (i > 128)
14900 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14901 gen_lowpart (V16SImode, src),
14902 gen_lowpart (V16SImode, src),
14903 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14904 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14905 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14906 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14907 GEN_INT (0xC), GEN_INT (0xD),
14908 GEN_INT (0xE), GEN_INT (0xF),
14909 GEN_INT (0x10), GEN_INT (0x11),
14910 GEN_INT (0x12), GEN_INT (0x13),
14911 GEN_INT (0x14), GEN_INT (0x15),
14912 GEN_INT (0x16), GEN_INT (0x17));
14913 else
14914 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14915 gen_lowpart (V16SImode, src),
14916 GEN_INT (i == 128 ? 0x2 : 0x1),
14917 GEN_INT (0x3),
14918 GEN_INT (0x3),
14919 GEN_INT (0x3),
14920 GEN_INT (i == 128 ? 0x6 : 0x5),
14921 GEN_INT (0x7),
14922 GEN_INT (0x7),
14923 GEN_INT (0x7),
14924 GEN_INT (i == 128 ? 0xA : 0x9),
14925 GEN_INT (0xB),
14926 GEN_INT (0xB),
14927 GEN_INT (0xB),
14928 GEN_INT (i == 128 ? 0xE : 0xD),
14929 GEN_INT (0xF),
14930 GEN_INT (0xF),
14931 GEN_INT (0xF));
14932 break;
14933 default:
14934 gcc_unreachable ();
14935 }
14936 emit_insn (tem);
14937 if (d != dest)
14938 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
14939 }
14940
14941 /* Expand a vector reduction. FN is the binary pattern to reduce;
14942 DEST is the destination; IN is the input vector. */
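/* E.g. a V4SImode reduction takes two halving steps: the upper 64 bits
   of IN are shifted down and combined with the lower 64 bits via FN,
   then the upper 32 bits of that partial result are combined with its
   low element, leaving the reduction value in element 0 of DEST. */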
14943
14944 void
14945 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
14946 {
14947 rtx half, dst, vec = in;
14948 machine_mode mode = GET_MODE (in);
14949 int i;
14950
14951 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
14952 if (TARGET_SSE4_1
14953 && mode == V8HImode
14954 && fn == gen_uminv8hi3)
14955 {
14956 emit_insn (gen_sse4_1_phminposuw (dest, in));
14957 return;
14958 }
14959
14960 for (i = GET_MODE_BITSIZE (mode);
14961 i > GET_MODE_UNIT_BITSIZE (mode);
14962 i >>= 1)
14963 {
14964 half = gen_reg_rtx (mode);
14965 emit_reduc_half (half, vec, i);
14966 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
14967 dst = dest;
14968 else
14969 dst = gen_reg_rtx (mode);
14970 emit_insn (fn (dst, half, vec));
14971 vec = dst;
14972 }
14973 }
14974
14975 /* Output code to perform a conditional jump to LABEL, if C2 flag in
14976 FP status register is set. */
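/* The status word is fetched with fnstsw. C2 is bit 10, i.e. bit 2 of
   its upper byte, hence the 0x04 test below; after sahf loads that byte
   into the flags it appears as PF, which the UNORDERED condition
   checks. */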
14977
14978 void
14979 ix86_emit_fp_unordered_jump (rtx label)
14980 {
14981 rtx reg = gen_reg_rtx (HImode);
14982 rtx_insn *insn;
14983 rtx temp;
14984
14985 emit_insn (gen_x86_fnstsw_1 (reg));
14986
14987 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
14988 {
14989 emit_insn (gen_x86_sahf_1 (reg));
14990
14991 temp = gen_rtx_REG (CCmode, FLAGS_REG);
14992 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
14993 }
14994 else
14995 {
14996 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
14997
14998 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14999 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15000 }
15001
15002 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15003 gen_rtx_LABEL_REF (VOIDmode, label),
15004 pc_rtx);
15005 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15006 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15007 JUMP_LABEL (insn) = label;
15008 }
15009
15010 /* Output code to perform a sinh XFmode calculation. */
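/* Writing u = expm1 (|x|) = e^|x| - 1, we have e^-|x| = 1 / (u + 1), so
   sinh (|x|) = (e^|x| - e^-|x|) / 2 = 0.5 * (u + u / (u + 1)); the sign
   of the argument is applied to the result afterwards. */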
15011
15012 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15013 {
15014 rtx e1 = gen_reg_rtx (XFmode);
15015 rtx e2 = gen_reg_rtx (XFmode);
15016 rtx scratch = gen_reg_rtx (HImode);
15017 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15018 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15019 rtx cst1, tmp;
15020 rtx_code_label *jump_label = gen_label_rtx ();
15021 rtx_insn *insn;
15022
15023 /* scratch = fxam (op1) */
15024 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15025
15026 /* e1 = expm1 (|op1|) */
15027 emit_insn (gen_absxf2 (e2, op1));
15028 emit_insn (gen_expm1xf2 (e1, e2));
15029
15030 /* e2 = e1 / (e1 + 1.0) + e1 */
15031 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15032 emit_insn (gen_addxf3 (e2, e1, cst1));
15033 emit_insn (gen_divxf3 (e2, e1, e2));
15034 emit_insn (gen_addxf3 (e2, e2, e1));
15035
15036 /* flags = signbit (op1) */
15037 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15038
15039 /* if (flags) then e2 = -e2 */
15040 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15041 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15042 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15043 pc_rtx);
15044 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15045 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15046 JUMP_LABEL (insn) = jump_label;
15047
15048 emit_insn (gen_negxf2 (e2, e2));
15049
15050 emit_label (jump_label);
15051 LABEL_NUSES (jump_label) = 1;
15052
15053 /* op0 = 0.5 * e2 */
15054 half = force_reg (XFmode, half);
15055 emit_insn (gen_mulxf3 (op0, e2, half));
15056 }
15057
15058 /* Output code to perform a cosh XFmode calculation. */
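/* With u = exp (x), cosh (x) = (e^x + e^-x) / 2 = 0.5 * (u + 1 / u),
   which is exactly the sequence emitted below. */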
15059
15060 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15061 {
15062 rtx e1 = gen_reg_rtx (XFmode);
15063 rtx e2 = gen_reg_rtx (XFmode);
15064 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15065 rtx cst1;
15066
15067 /* e1 = exp (op1) */
15068 emit_insn (gen_expxf2 (e1, op1));
15069
15070 /* e2 = e1 + 1.0 / e1 */
15071 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15072 emit_insn (gen_divxf3 (e2, cst1, e1));
15073 emit_insn (gen_addxf3 (e2, e1, e2));
15074
15075 /* op0 = 0.5 * e2 */
15076 half = force_reg (XFmode, half);
15077 emit_insn (gen_mulxf3 (op0, e2, half));
15078 }
15079
15080 /* Output code to perform a tanh XFmode calculation. */
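/* Writing u = expm1 (-2|x|) = e^-2|x| - 1,
   tanh (|x|) = (1 - e^-2|x|) / (1 + e^-2|x|) = -u / (u + 2); the code
   computes u / (u + 2) and negates it unless the argument is negative,
   which also gives the result the sign of the argument. */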
15081
15082 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15083 {
15084 rtx e1 = gen_reg_rtx (XFmode);
15085 rtx e2 = gen_reg_rtx (XFmode);
15086 rtx scratch = gen_reg_rtx (HImode);
15087 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15088 rtx cst2, tmp;
15089 rtx_code_label *jump_label = gen_label_rtx ();
15090 rtx_insn *insn;
15091
15092 /* scratch = fxam (op1) */
15093 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15094
15095 /* e1 = expm1 (-|2 * op1|) */
15096 emit_insn (gen_addxf3 (e2, op1, op1));
15097 emit_insn (gen_absxf2 (e2, e2));
15098 emit_insn (gen_negxf2 (e2, e2));
15099 emit_insn (gen_expm1xf2 (e1, e2));
15100
15101 /* e2 = e1 / (e1 + 2.0) */
15102 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15103 emit_insn (gen_addxf3 (e2, e1, cst2));
15104 emit_insn (gen_divxf3 (e2, e1, e2));
15105
15106 /* flags = signbit (op1) */
15107 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15108
15109 /* if (!flags) then e2 = -e2 */
15110 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15111 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15112 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15113 pc_rtx);
15114 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15115 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15116 JUMP_LABEL (insn) = jump_label;
15117
15118 emit_insn (gen_negxf2 (e2, e2));
15119
15120 emit_label (jump_label);
15121 LABEL_NUSES (jump_label) = 1;
15122
15123 emit_move_insn (op0, e2);
15124 }
15125
15126 /* Output code to perform an asinh XFmode calculation. */
15127
15128 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15129 {
15130 rtx e1 = gen_reg_rtx (XFmode);
15131 rtx e2 = gen_reg_rtx (XFmode);
15132 rtx scratch = gen_reg_rtx (HImode);
15133 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15134 rtx cst1, tmp;
15135 rtx_code_label *jump_label = gen_label_rtx ();
15136 rtx_insn *insn;
15137
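  /* For exposition only -- a rough C sketch of the identity expanded below
     (illustrative, not the emitted code):
	t = x * x;
	e1 = t / (sqrt (t + 1.0) + 1.0) + fabs (x);
					== fabs (x) + sqrt (x*x + 1.0) - 1.0
	result = copysign (log1p (e1), x);  */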
15138 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15139 emit_insn (gen_mulxf3 (e1, op1, op1));
15140 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15141 emit_insn (gen_addxf3 (e2, e1, cst1));
15142 emit_insn (gen_sqrtxf2 (e2, e2));
15143 emit_insn (gen_addxf3 (e2, e2, cst1));
15144
15145 /* e1 = e1 / e2 */
15146 emit_insn (gen_divxf3 (e1, e1, e2));
15147
15148 /* scratch = fxam (op1) */
15149 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15150
15151 /* e1 = e1 + |op1| */
15152 emit_insn (gen_absxf2 (e2, op1));
15153 emit_insn (gen_addxf3 (e1, e1, e2));
15154
15155 /* e2 = log1p (e1) */
15156 ix86_emit_i387_log1p (e2, e1);
15157
15158 /* flags = signbit (op1) */
15159 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15160
15161 /* if (flags) then e2 = -e2 */
15162 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15163 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15164 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15165 pc_rtx);
15166 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15167 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15168 JUMP_LABEL (insn) = jump_label;
15169
15170 emit_insn (gen_negxf2 (e2, e2));
15171
15172 emit_label (jump_label);
15173 LABEL_NUSES (jump_label) = 1;
15174
15175 emit_move_insn (op0, e2);
15176 }
15177
15178 /* Output code to perform an acosh XFmode calculation. */
15179
15180 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15181 {
15182 rtx e1 = gen_reg_rtx (XFmode);
15183 rtx e2 = gen_reg_rtx (XFmode);
15184 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15185
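  /* For exposition only -- a rough C sketch of the identity expanded below
     (illustrative, not the emitted code):
	result = log (x + sqrt (x - 1.0) * sqrt (x + 1.0));  */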
15186 /* e2 = sqrt (op1 + 1.0) */
15187 emit_insn (gen_addxf3 (e2, op1, cst1));
15188 emit_insn (gen_sqrtxf2 (e2, e2));
15189
15190 /* e1 = sqrt (op1 - 1.0) */
15191 emit_insn (gen_subxf3 (e1, op1, cst1));
15192 emit_insn (gen_sqrtxf2 (e1, e1));
15193
15194 /* e1 = e1 * e2 */
15195 emit_insn (gen_mulxf3 (e1, e1, e2));
15196
15197 /* e1 = e1 + op1 */
15198 emit_insn (gen_addxf3 (e1, e1, op1));
15199
15200 /* op0 = log (e1) */
15201 emit_insn (gen_logxf2 (op0, e1));
15202 }
15203
15204 /* Output code to perform an atanh XFmode calculation. */
15205
15206 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15207 {
15208 rtx e1 = gen_reg_rtx (XFmode);
15209 rtx e2 = gen_reg_rtx (XFmode);
15210 rtx scratch = gen_reg_rtx (HImode);
15211 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15212 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15213 rtx cst1, tmp;
15214 rtx_code_label *jump_label = gen_label_rtx ();
15215 rtx_insn *insn;
15216
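  /* For exposition only -- a rough C sketch of the identity expanded below
     (illustrative, not the emitted code):
	t = log1p (-2.0 * fabs (x) / (fabs (x) + 1.0));
					== -2.0 * atanh (fabs (x))
	result = (x < 0.0 ? 0.5 : -0.5) * t;  */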
15217 /* scratch = fxam (op1) */
15218 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15219
15220 /* e2 = |op1| */
15221 emit_insn (gen_absxf2 (e2, op1));
15222
15223 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15224 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15225 emit_insn (gen_addxf3 (e1, e2, cst1));
15226 emit_insn (gen_addxf3 (e2, e2, e2));
15227 emit_insn (gen_negxf2 (e2, e2));
15228 emit_insn (gen_divxf3 (e1, e2, e1));
15229
15230 /* e2 = log1p (e1) */
15231 ix86_emit_i387_log1p (e2, e1);
15232
15233 /* flags = signbit (op1) */
15234 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15235
15236 /* if (!flags) then e2 = -e2 */
15237 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15238 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15239 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15240 pc_rtx);
15241 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15242 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15243 JUMP_LABEL (insn) = jump_label;
15244
15245 emit_insn (gen_negxf2 (e2, e2));
15246
15247 emit_label (jump_label);
15248 LABEL_NUSES (jump_label) = 1;
15249
15250 /* op0 = 0.5 * e2 */
15251 half = force_reg (XFmode, half);
15252 emit_insn (gen_mulxf3 (op0, e2, half));
15253 }
15254
15255 /* Output code to perform a log1p XFmode calculation. */
15256
15257 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15258 {
15259 rtx_code_label *label1 = gen_label_rtx ();
15260 rtx_code_label *label2 = gen_label_rtx ();
15261
15262 rtx tmp = gen_reg_rtx (XFmode);
15263 rtx res = gen_reg_rtx (XFmode);
15264 rtx cst, cstln2, cst1;
15265 rtx_insn *insn;
15266
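  /* For exposition only -- a rough C sketch of the computation expanded below
     (illustrative, not the emitted code).  The 0.2928... cutoff below is
     1 - sqrt (0.5), the range over which fyl2xp1 is documented to be usable:
	if (fabs (x) < 1.0 - sqrt (0.5))
	  result = M_LN2 * log2_of_1_plus (x);	   fyl2xp1 path, avoids cancellation
	else
	  result = M_LN2 * log2 (x + 1.0);	   fyl2x path
     where log2_of_1_plus is only a stand-in name for what fyl2xp1 computes.  */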
15267 cst = const_double_from_real_value
15268 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15269 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15270
15271 emit_insn (gen_absxf2 (tmp, op1));
15272
15273 cst = force_reg (XFmode, cst);
15274 ix86_expand_branch (GE, tmp, cst, label1);
15275 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15276 insn = get_last_insn ();
15277 JUMP_LABEL (insn) = label1;
15278
15279 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15280 emit_jump (label2);
15281
15282 emit_label (label1);
15283 LABEL_NUSES (label1) = 1;
15284
15285 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15286 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15287 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15288
15289 emit_label (label2);
15290 LABEL_NUSES (label2) = 1;
15291
15292 emit_move_insn (op0, res);
15293 }
15294
15295 /* Emit code for round calculation. */
15296 void ix86_emit_i387_round (rtx op0, rtx op1)
15297 {
15298 machine_mode inmode = GET_MODE (op1);
15299 machine_mode outmode = GET_MODE (op0);
15300 rtx e1 = gen_reg_rtx (XFmode);
15301 rtx e2 = gen_reg_rtx (XFmode);
15302 rtx scratch = gen_reg_rtx (HImode);
15303 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15304 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15305 rtx res = gen_reg_rtx (outmode);
15306 rtx_code_label *jump_label = gen_label_rtx ();
15307 rtx (*floor_insn) (rtx, rtx);
15308 rtx (*neg_insn) (rtx, rtx);
15309 rtx_insn *insn;
15310 rtx tmp;
15311
15312 switch (inmode)
15313 {
15314 case E_SFmode:
15315 case E_DFmode:
15316 tmp = gen_reg_rtx (XFmode);
15317
15318 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15319 op1 = tmp;
15320 break;
15321 case E_XFmode:
15322 break;
15323 default:
15324 gcc_unreachable ();
15325 }
15326
15327 switch (outmode)
15328 {
15329 case E_SFmode:
15330 floor_insn = gen_frndintxf2_floor;
15331 neg_insn = gen_negsf2;
15332 break;
15333 case E_DFmode:
15334 floor_insn = gen_frndintxf2_floor;
15335 neg_insn = gen_negdf2;
15336 break;
15337 case E_XFmode:
15338 floor_insn = gen_frndintxf2_floor;
15339 neg_insn = gen_negxf2;
15340 break;
15341 case E_HImode:
15342 floor_insn = gen_lfloorxfhi2;
15343 neg_insn = gen_neghi2;
15344 break;
15345 case E_SImode:
15346 floor_insn = gen_lfloorxfsi2;
15347 neg_insn = gen_negsi2;
15348 break;
15349 case E_DImode:
15350 floor_insn = gen_lfloorxfdi2;
15351 neg_insn = gen_negdi2;
15352 break;
15353 default:
15354 gcc_unreachable ();
15355 }
15356
15357 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15358
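  /* For exposition only -- a rough C sketch of the sign handling below
     (illustrative, not the emitted code):
	res = floor (fabs (a) + 0.5);
	if (signbit (a))
	  res = -res;  */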
15359 /* scratch = fxam(op1) */
15360 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15361
15362 /* e1 = fabs(op1) */
15363 emit_insn (gen_absxf2 (e1, op1));
15364
15365 /* e2 = e1 + 0.5 */
15366 half = force_reg (XFmode, half);
15367 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15368
15369 /* res = floor(e2) */
15370 switch (outmode)
15371 {
15372 case E_SFmode:
15373 case E_DFmode:
15374 {
15375 tmp = gen_reg_rtx (XFmode);
15376
15377 emit_insn (floor_insn (tmp, e2));
15378 emit_insn (gen_rtx_SET (res,
15379 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15380 UNSPEC_TRUNC_NOOP)));
15381 }
15382 break;
15383 default:
15384 emit_insn (floor_insn (res, e2));
15385 }
15386
15387 /* flags = signbit(a) */
15388 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15389
15390 /* if (flags) then res = -res */
15391 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15392 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15393 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15394 pc_rtx);
15395 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15396 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15397 JUMP_LABEL (insn) = jump_label;
15398
15399 emit_insn (neg_insn (res, res));
15400
15401 emit_label (jump_label);
15402 LABEL_NUSES (jump_label) = 1;
15403
15404 emit_move_insn (op0, res);
15405 }
15406
15407 /* Output code to perform a Newton-Raphson approximation of a single precision
15408 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15409
15410 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15411 {
15412 rtx x0, x1, e0, e1;
15413
15414 x0 = gen_reg_rtx (mode);
15415 e0 = gen_reg_rtx (mode);
15416 e1 = gen_reg_rtx (mode);
15417 x1 = gen_reg_rtx (mode);
15418
15419 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15420
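  /* For exposition only -- this is one Newton-Raphson refinement of the
     hardware reciprocal estimate (illustrative, not the emitted code):
	x0 = rcp (b);			  low-precision estimate
	x1 = x0 * (2.0 - b * x0);	  one NR step roughly doubles the precision
	result = a * x1;  */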
15421 b = force_reg (mode, b);
15422
15423 /* x0 = rcp(b) estimate */
15424 if (mode == V16SFmode || mode == V8DFmode)
15425 {
15426 if (TARGET_AVX512ER)
15427 {
15428 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15429 UNSPEC_RCP28)));
15430 /* res = a * x0 */
15431 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15432 return;
15433 }
15434 else
15435 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15436 UNSPEC_RCP14)));
15437 }
15438 else
15439 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15440 UNSPEC_RCP)));
15441
15442 /* e0 = x0 * b */
15443 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15444
15445 /* e0 = x0 * e0 */
15446 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15447
15448 /* e1 = x0 + x0 */
15449 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15450
15451 /* x1 = e1 - e0 */
15452 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15453
15454 /* res = a * x1 */
15455 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15456 }
15457
15458 /* Output code to perform a Newton-Raphson approximation of a
15459 single precision floating point [reciprocal] square root. */
15460
15461 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15462 {
15463 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15464 REAL_VALUE_TYPE r;
15465 int unspec;
15466
15467 x0 = gen_reg_rtx (mode);
15468 e0 = gen_reg_rtx (mode);
15469 e1 = gen_reg_rtx (mode);
15470 e2 = gen_reg_rtx (mode);
15471 e3 = gen_reg_rtx (mode);
15472
15473 if (TARGET_AVX512ER && mode == V16SFmode)
15474 {
15475 if (recip)
15476 /* res = rsqrt28(a) estimate */
15477 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15478 UNSPEC_RSQRT28)));
15479 else
15480 {
15481 /* x0 = rsqrt28(a) estimate */
15482 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15483 UNSPEC_RSQRT28)));
15484 /* res = rcp28(x0) estimate */
15485 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15486 UNSPEC_RCP28)));
15487 }
15488 return;
15489 }
15490
15491 real_from_integer (&r, VOIDmode, -3, SIGNED);
15492 mthree = const_double_from_real_value (r, SFmode);
15493
15494 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15495 mhalf = const_double_from_real_value (r, SFmode);
15496 unspec = UNSPEC_RSQRT;
15497
15498 if (VECTOR_MODE_P (mode))
15499 {
15500 mthree = ix86_build_const_vector (mode, true, mthree);
15501 mhalf = ix86_build_const_vector (mode, true, mhalf);
15502 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15503 if (GET_MODE_SIZE (mode) == 64)
15504 unspec = UNSPEC_RSQRT14;
15505 }
15506
15507 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15508 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15509
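  /* For exposition only -- the formulas above are the standard Newton-Raphson
     step x1 = 0.5 * x0 * (3.0 - a * x0 * x0) for x0 = rsqrt estimate of a,
     refactored so the parenthesised term is formed first and the -0.5 factor
     is applied last (illustrative, not the emitted code):
	x0 = rsqrt_estimate (a);
	e  = a * x0 * x0 - 3.0;
	rsqrt:	result = (-0.5 * x0) * e;
	sqrt:	result = (-0.5 * a * x0) * e;	since sqrt (a) == a * rsqrt (a)  */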
15510 a = force_reg (mode, a);
15511
15512 /* x0 = rsqrt(a) estimate */
15513 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15514 unspec)));
15515
15516 /* If a == 0.0, filter out the infinite rsqrt (0.0) result to prevent a NaN for sqrt (0.0). */
15517 if (!recip)
15518 {
15519 rtx zero = force_reg (mode, CONST0_RTX(mode));
15520 rtx mask;
15521
15522 /* Handle masked compare. */
15523 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15524 {
15525 mask = gen_reg_rtx (HImode);
15526 /* Imm value 0x4 corresponds to not-equal comparison. */
15527 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15528 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15529 }
15530 else
15531 {
15532 mask = gen_reg_rtx (mode);
15533 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15534 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15535 }
15536 }
15537
15538 mthree = force_reg (mode, mthree);
15539
15540 /* e0 = x0 * a */
15541 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15542
15543 unsigned vector_size = GET_MODE_SIZE (mode);
15544 if (TARGET_FMA
15545 || (TARGET_AVX512F && vector_size == 64)
15546 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
15547 emit_insn (gen_rtx_SET (e2,
15548 gen_rtx_FMA (mode, e0, x0, mthree)));
15549 else
15550 {
15551 /* e1 = e0 * x0 */
15552 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15553
15554 /* e2 = e1 - 3. */
15555 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15556 }
15557
15558 mhalf = force_reg (mode, mhalf);
15559 if (recip)
15560 /* e3 = -.5 * x0 */
15561 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15562 else
15563 /* e3 = -.5 * e0 */
15564 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15565 /* ret = e2 * e3 */
15566 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15567 }
15568
15569 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15570 mask for masking out the sign-bit is stored in *SMASK, if that is
15571 non-null. */
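/* For exposition only, the bit-level operation this expands to (illustrative):
     xa = op0 & ~SIGN_BIT;
   with the ~SIGN_BIT constant also returned through *SMASK for reuse by the
   callers' later copysign step.  */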
15572
15573 static rtx
15574 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15575 {
15576 machine_mode vmode, mode = GET_MODE (op0);
15577 rtx xa, mask;
15578
15579 xa = gen_reg_rtx (mode);
15580 if (mode == SFmode)
15581 vmode = V4SFmode;
15582 else if (mode == DFmode)
15583 vmode = V2DFmode;
15584 else
15585 vmode = mode;
15586 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15587 if (!VECTOR_MODE_P (mode))
15588 {
15589 /* We need to generate a scalar mode mask in this case. */
15590 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15591 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15592 mask = gen_reg_rtx (mode);
15593 emit_insn (gen_rtx_SET (mask, tmp));
15594 }
15595 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15596
15597 if (smask)
15598 *smask = mask;
15599
15600 return xa;
15601 }
15602
15603 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15604 swapping the operands if SWAP_OPERANDS is true. The expanded
15605 code is a forward jump to a newly created label in case the
15606 comparison is true. The generated label rtx is returned. */
15607 static rtx_code_label *
15608 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15609 bool swap_operands)
15610 {
15611 bool unordered_compare = ix86_unordered_fp_compare (code);
15612 rtx_code_label *label;
15613 rtx tmp, reg;
15614
15615 if (swap_operands)
15616 std::swap (op0, op1);
15617
15618 label = gen_label_rtx ();
15619 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15620 if (unordered_compare)
15621 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15622 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15623 emit_insn (gen_rtx_SET (reg, tmp));
15624 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15625 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15626 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15627 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15628 JUMP_LABEL (tmp) = label;
15629
15630 return label;
15631 }
15632
15633 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15634 using comparison code CODE. Operands are swapped for the comparison if
15635 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15636 static rtx
15637 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15638 bool swap_operands)
15639 {
15640 rtx (*insn)(rtx, rtx, rtx, rtx);
15641 machine_mode mode = GET_MODE (op0);
15642 rtx mask = gen_reg_rtx (mode);
15643
15644 if (swap_operands)
15645 std::swap (op0, op1);
15646
15647 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15648
15649 emit_insn (insn (mask, op0, op1,
15650 gen_rtx_fmt_ee (code, mode, op0, op1)));
15651 return mask;
15652 }
15653
15654 /* Expand copysign: copy the sign of SIGN onto the positive value ABS_VALUE,
15655 storing the result in RESULT. If MASK is non-null, it is a mask that
15656 clears the sign-bit. */
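/* For exposition only, the bit-level operation this expands to (illustrative):
     result = abs_value | (sign & SIGN_BIT_MASK);
   where SIGN_BIT_MASK has only the sign bit set (MASK, when supplied, is its
   complement).  */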
15657
15658 static void
15659 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15660 {
15661 machine_mode mode = GET_MODE (sign);
15662 rtx sgn = gen_reg_rtx (mode);
15663 if (mask == NULL_RTX)
15664 {
15665 machine_mode vmode;
15666
15667 if (mode == SFmode)
15668 vmode = V4SFmode;
15669 else if (mode == DFmode)
15670 vmode = V2DFmode;
15671 else
15672 vmode = mode;
15673
15674 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15675 if (!VECTOR_MODE_P (mode))
15676 {
15677 /* We need to generate a scalar mode mask in this case. */
15678 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15679 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15680 mask = gen_reg_rtx (mode);
15681 emit_insn (gen_rtx_SET (mask, tmp));
15682 }
15683 }
15684 else
15685 mask = gen_rtx_NOT (mode, mask);
15686 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15687 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15688 }
15689
15690 /* Expand SSE sequence for computing lround from OP1 storing
15691 into OP0. */
15692
15693 void
15694 ix86_expand_lround (rtx op0, rtx op1)
15695 {
15696 /* C code for the stuff we're doing below:
15697 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15698 return (long)tmp;
15699 */
15700 machine_mode mode = GET_MODE (op1);
15701 const struct real_format *fmt;
15702 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15703 rtx adj;
15704
15705 /* load nextafter (0.5, 0.0) */
15706 fmt = REAL_MODE_FORMAT (mode);
15707 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15708 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
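  /* For exposition only: with p mantissa bits, 0.5 - 2**(-p - 1) is the
     largest representable value below 0.5, i.e. nextafter (0.5, 0.0).  Adding
     it instead of 0.5 keeps arguments just below 0.5 from being rounded up to
     the next integer by the addition.  */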
15709
15710 /* adj = copysign (0.5, op1) */
15711 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15712 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15713
15714 /* adj = op1 + adj */
15715 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15716
15717 /* op0 = (imode)adj */
15718 expand_fix (op0, adj, 0);
15719 }
15720
15721 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
15722 into OPERAND0. */
15723
15724 void
15725 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15726 {
15727 /* C code for the stuff we're doing below (for do_floor):
15728 xi = (long)op1;
15729 xi -= (double)xi > op1 ? 1 : 0;
15730 return xi;
15731 */
15732 machine_mode fmode = GET_MODE (op1);
15733 machine_mode imode = GET_MODE (op0);
15734 rtx ireg, freg, tmp;
15735 rtx_code_label *label;
15736
15737 /* reg = (long)op1 */
15738 ireg = gen_reg_rtx (imode);
15739 expand_fix (ireg, op1, 0);
15740
15741 /* freg = (double)reg */
15742 freg = gen_reg_rtx (fmode);
15743 expand_float (freg, ireg, 0);
15744
15745 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15746 label = ix86_expand_sse_compare_and_jump (UNLE,
15747 freg, op1, !do_floor);
15748 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15749 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15750 emit_move_insn (ireg, tmp);
15751
15752 emit_label (label);
15753 LABEL_NUSES (label) = 1;
15754
15755 emit_move_insn (op0, ireg);
15756 }
15757
15758 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15759 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
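/* For exposition only: the add-then-subtract of this constant in the callers
   is the usual round-to-integer trick.  For fabs (x) < 2**n the sum lands in
   [2**n, 2**(n+1)), where the spacing between representable values is 1.0, so
   the addition itself rounds x to an integer in the current rounding mode and
   subtracting 2**n recovers that integer.  */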
15760
15761 static rtx
15762 ix86_gen_TWO52 (machine_mode mode)
15763 {
15764 REAL_VALUE_TYPE TWO52r;
15765 rtx TWO52;
15766
15767 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15768 TWO52 = const_double_from_real_value (TWO52r, mode);
15769 TWO52 = force_reg (mode, TWO52);
15770
15771 return TWO52;
15772 }
15773
15774 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
15775
15776 void
15777 ix86_expand_rint (rtx operand0, rtx operand1)
15778 {
15779 /* C code for the stuff we're doing below:
15780 xa = fabs (operand1);
15781 if (!isless (xa, 2**52))
15782 return operand1;
15783 two52 = 2**52;
15784 if (flag_rounding_math)
15785 {
15786 two52 = copysign (two52, operand1);
15787 xa = operand1;
15788 }
15789 xa = xa + two52 - two52;
15790 return copysign (xa, operand1);
15791 */
15792 machine_mode mode = GET_MODE (operand0);
15793 rtx res, xa, TWO52, two52, mask;
15794 rtx_code_label *label;
15795
15796 res = gen_reg_rtx (mode);
15797 emit_move_insn (res, operand1);
15798
15799 /* xa = abs (operand1) */
15800 xa = ix86_expand_sse_fabs (res, &mask);
15801
15802 /* if (!isless (xa, TWO52)) goto label; */
15803 TWO52 = ix86_gen_TWO52 (mode);
15804 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15805
15806 two52 = TWO52;
15807 if (flag_rounding_math)
15808 {
15809 two52 = gen_reg_rtx (mode);
15810 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
15811 xa = res;
15812 }
15813
15814 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
15815 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
15816
15817 ix86_sse_copysign_to_positive (res, xa, res, mask);
15818
15819 emit_label (label);
15820 LABEL_NUSES (label) = 1;
15821
15822 emit_move_insn (operand0, res);
15823 }
15824
15825 /* Expand SSE2 sequence for computing floor or ceil
15826 from OPERAND1 storing into OPERAND0. */
15827 void
15828 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15829 {
15830 /* C code for the stuff we expand below.
15831 double xa = fabs (x), x2;
15832 if (!isless (xa, TWO52))
15833 return x;
15834 x2 = (double)(long)x;
15835 Compensate. Floor:
15836 if (x2 > x)
15837 x2 -= 1;
15838 Compensate. Ceil:
15839 if (x2 < x)
15840 x2 += 1;
15841 if (HONOR_SIGNED_ZEROS (mode))
15842 return copysign (x2, x);
15843 return x2;
15844 */
15845 machine_mode mode = GET_MODE (operand0);
15846 rtx xa, xi, TWO52, tmp, one, res, mask;
15847 rtx_code_label *label;
15848
15849 TWO52 = ix86_gen_TWO52 (mode);
15850
15851 /* Temporary for holding the result, initialized to the input
15852 operand to ease control flow. */
15853 res = gen_reg_rtx (mode);
15854 emit_move_insn (res, operand1);
15855
15856 /* xa = abs (operand1) */
15857 xa = ix86_expand_sse_fabs (res, &mask);
15858
15859 /* if (!isless (xa, TWO52)) goto label; */
15860 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15861
15862 /* xa = (double)(long)x */
15863 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15864 expand_fix (xi, res, 0);
15865 expand_float (xa, xi, 0);
15866
15867 /* generate 1.0 */
15868 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15869
15870 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15871 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15872 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15873 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15874 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15875 emit_move_insn (res, tmp);
15876
15877 if (HONOR_SIGNED_ZEROS (mode))
15878 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15879
15880 emit_label (label);
15881 LABEL_NUSES (label) = 1;
15882
15883 emit_move_insn (operand0, res);
15884 }
15885
15886 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15887 into OPERAND0 without relying on DImode truncation via cvttsd2siq
15888 that is only available on 64bit targets. */
15889 void
15890 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15891 {
15892 /* C code for the stuff we expand below.
15893 double xa = fabs (x), x2;
15894 if (!isless (xa, TWO52))
15895 return x;
15896 xa = xa + TWO52 - TWO52;
15897 x2 = copysign (xa, x);
15898 Compensate. Floor:
15899 if (x2 > x)
15900 x2 -= 1;
15901 Compensate. Ceil:
15902 if (x2 < x)
15903 x2 += 1;
15904 if (HONOR_SIGNED_ZEROS (mode))
15905 x2 = copysign (x2, x);
15906 return x2;
15907 */
15908 machine_mode mode = GET_MODE (operand0);
15909 rtx xa, TWO52, tmp, one, res, mask;
15910 rtx_code_label *label;
15911
15912 TWO52 = ix86_gen_TWO52 (mode);
15913
15914 /* Temporary for holding the result, initialized to the input
15915 operand to ease control flow. */
15916 res = gen_reg_rtx (mode);
15917 emit_move_insn (res, operand1);
15918
15919 /* xa = abs (operand1) */
15920 xa = ix86_expand_sse_fabs (res, &mask);
15921
15922 /* if (!isless (xa, TWO52)) goto label; */
15923 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15924
15925 /* xa = xa + TWO52 - TWO52; */
15926 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15927 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15928
15929 /* xa = copysign (xa, operand1) */
15930 ix86_sse_copysign_to_positive (xa, xa, res, mask);
15931
15932 /* generate 1.0 */
15933 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15934
15935 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15936 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15937 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15938 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15939 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15940 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
15941 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15942 emit_move_insn (res, tmp);
15943
15944 emit_label (label);
15945 LABEL_NUSES (label) = 1;
15946
15947 emit_move_insn (operand0, res);
15948 }
15949
15950 /* Expand SSE sequence for computing trunc
15951 from OPERAND1 storing into OPERAND0. */
15952 void
15953 ix86_expand_trunc (rtx operand0, rtx operand1)
15954 {
15955 /* C code for SSE variant we expand below.
15956 double xa = fabs (x), x2;
15957 if (!isless (xa, TWO52))
15958 return x;
15959 x2 = (double)(long)x;
15960 if (HONOR_SIGNED_ZEROS (mode))
15961 return copysign (x2, x);
15962 return x2;
15963 */
15964 machine_mode mode = GET_MODE (operand0);
15965 rtx xa, xi, TWO52, res, mask;
15966 rtx_code_label *label;
15967
15968 TWO52 = ix86_gen_TWO52 (mode);
15969
15970 /* Temporary for holding the result, initialized to the input
15971 operand to ease control flow. */
15972 res = gen_reg_rtx (mode);
15973 emit_move_insn (res, operand1);
15974
15975 /* xa = abs (operand1) */
15976 xa = ix86_expand_sse_fabs (res, &mask);
15977
15978 /* if (!isless (xa, TWO52)) goto label; */
15979 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15980
15981 /* x = (double)(long)x */
15982 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15983 expand_fix (xi, res, 0);
15984 expand_float (res, xi, 0);
15985
15986 if (HONOR_SIGNED_ZEROS (mode))
15987 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15988
15989 emit_label (label);
15990 LABEL_NUSES (label) = 1;
15991
15992 emit_move_insn (operand0, res);
15993 }
15994
15995 /* Expand SSE sequence for computing trunc from OPERAND1 storing
15996 into OPERAND0 without relying on DImode truncation via cvttsd2siq
15997 that is only available on 64bit targets. */
15998 void
15999 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16000 {
16001 machine_mode mode = GET_MODE (operand0);
16002 rtx xa, mask, TWO52, one, res, smask, tmp;
16003 rtx_code_label *label;
16004
16005 /* C code for SSE variant we expand below.
16006 double xa = fabs (x), x2;
16007 if (!isless (xa, TWO52))
16008 return x;
16009 xa2 = xa + TWO52 - TWO52;
16010 Compensate:
16011 if (xa2 > xa)
16012 xa2 -= 1.0;
16013 x2 = copysign (xa2, x);
16014 return x2;
16015 */
16016
16017 TWO52 = ix86_gen_TWO52 (mode);
16018
16019 /* Temporary for holding the result, initialized to the input
16020 operand to ease control flow. */
16021 res = gen_reg_rtx (mode);
16022 emit_move_insn (res, operand1);
16023
16024 /* xa = abs (operand1) */
16025 xa = ix86_expand_sse_fabs (res, &smask);
16026
16027 /* if (!isless (xa, TWO52)) goto label; */
16028 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16029
16030 /* res = xa + TWO52 - TWO52; */
16031 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16032 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
16033 emit_move_insn (res, tmp);
16034
16035 /* generate 1.0 */
16036 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16037
16038 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
16039 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
16040 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
16041 tmp = expand_simple_binop (mode, MINUS,
16042 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
16043 emit_move_insn (res, tmp);
16044
16045 /* res = copysign (res, operand1) */
16046 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
16047
16048 emit_label (label);
16049 LABEL_NUSES (label) = 1;
16050
16051 emit_move_insn (operand0, res);
16052 }
16053
16054 /* Expand SSE sequence for computing round
16055 from OPERAND1 storing into OPERAND0. */
16056 void
16057 ix86_expand_round (rtx operand0, rtx operand1)
16058 {
16059 /* C code for the stuff we're doing below:
16060 double xa = fabs (x);
16061 if (!isless (xa, TWO52))
16062 return x;
16063 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16064 return copysign (xa, x);
16065 */
16066 machine_mode mode = GET_MODE (operand0);
16067 rtx res, TWO52, xa, xi, half, mask;
16068 rtx_code_label *label;
16069 const struct real_format *fmt;
16070 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16071
16072 /* Temporary for holding the result, initialized to the input
16073 operand to ease control flow. */
16074 res = gen_reg_rtx (mode);
16075 emit_move_insn (res, operand1);
16076
16077 TWO52 = ix86_gen_TWO52 (mode);
16078 xa = ix86_expand_sse_fabs (res, &mask);
16079 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16080
16081 /* load nextafter (0.5, 0.0) */
16082 fmt = REAL_MODE_FORMAT (mode);
16083 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16084 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16085
16086 /* xa = xa + 0.5 */
16087 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16088 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16089
16090 /* xa = (double)(int64_t)xa */
16091 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16092 expand_fix (xi, xa, 0);
16093 expand_float (xa, xi, 0);
16094
16095 /* res = copysign (xa, operand1) */
16096 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16097
16098 emit_label (label);
16099 LABEL_NUSES (label) = 1;
16100
16101 emit_move_insn (operand0, res);
16102 }
16103
16104 /* Expand SSE sequence for computing round from OPERAND1 storing
16105 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16106 that is only available on 64bit targets. */
16107 void
16108 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16109 {
16110 /* C code for the stuff we expand below.
16111 double xa = fabs (x), xa2, x2;
16112 if (!isless (xa, TWO52))
16113 return x;
16114 Using the absolute value and copying back sign makes
16115 -0.0 -> -0.0 correct.
16116 xa2 = xa + TWO52 - TWO52;
16117 Compensate.
16118 dxa = xa2 - xa;
16119 if (dxa <= -0.5)
16120 xa2 += 1;
16121 else if (dxa > 0.5)
16122 xa2 -= 1;
16123 x2 = copysign (xa2, x);
16124 return x2;
16125 */
16126 machine_mode mode = GET_MODE (operand0);
16127 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16128 rtx_code_label *label;
16129
16130 TWO52 = ix86_gen_TWO52 (mode);
16131
16132 /* Temporary for holding the result, initialized to the input
16133 operand to ease control flow. */
16134 res = gen_reg_rtx (mode);
16135 emit_move_insn (res, operand1);
16136
16137 /* xa = abs (operand1) */
16138 xa = ix86_expand_sse_fabs (res, &mask);
16139
16140 /* if (!isless (xa, TWO52)) goto label; */
16141 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16142
16143 /* xa2 = xa + TWO52 - TWO52; */
16144 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16145 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16146
16147 /* dxa = xa2 - xa; */
16148 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16149
16150 /* generate 0.5, 1.0 and -0.5 */
16151 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16152 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16153 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16154 0, OPTAB_DIRECT);
16155
16156 /* Compensate. */
16157 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16158 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16159 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16160 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16161 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16162 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16163 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16164 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16165
16166 /* res = copysign (xa2, operand1) */
16167 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16168
16169 emit_label (label);
16170 LABEL_NUSES (label) = 1;
16171
16172 emit_move_insn (operand0, res);
16173 }
16174
16175 /* Expand SSE sequence for computing round
16176 from OP1 storing into OP0 using sse4 round insn. */
16177 void
16178 ix86_expand_round_sse4 (rtx op0, rtx op1)
16179 {
16180 machine_mode mode = GET_MODE (op0);
16181 rtx e1, e2, res, half;
16182 const struct real_format *fmt;
16183 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16184 rtx (*gen_copysign) (rtx, rtx, rtx);
16185 rtx (*gen_round) (rtx, rtx, rtx);
16186
16187 switch (mode)
16188 {
16189 case E_SFmode:
16190 gen_copysign = gen_copysignsf3;
16191 gen_round = gen_sse4_1_roundsf2;
16192 break;
16193 case E_DFmode:
16194 gen_copysign = gen_copysigndf3;
16195 gen_round = gen_sse4_1_rounddf2;
16196 break;
16197 default:
16198 gcc_unreachable ();
16199 }
16200
16201 /* round (a) = trunc (a + copysign (0.5, a)) */
16202
16203 /* load nextafter (0.5, 0.0) */
16204 fmt = REAL_MODE_FORMAT (mode);
16205 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16206 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16207 half = const_double_from_real_value (pred_half, mode);
16208
16209 /* e1 = copysign (0.5, op1) */
16210 e1 = gen_reg_rtx (mode);
16211 emit_insn (gen_copysign (e1, half, op1));
16212
16213 /* e2 = op1 + e1 */
16214 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16215
16216 /* res = trunc (e2) */
16217 res = gen_reg_rtx (mode);
16218 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16219
16220 emit_move_insn (op0, res);
16221 }
16222
16223 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16224 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16225 insn every time. */
16226
16227 static GTY(()) rtx_insn *vselect_insn;
16228
16229 /* Initialize vselect_insn. */
16230
16231 static void
16232 init_vselect_insn (void)
16233 {
16234 unsigned i;
16235 rtx x;
16236
16237 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16238 for (i = 0; i < MAX_VECT_LEN; ++i)
16239 XVECEXP (x, 0, i) = const0_rtx;
16240 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16241 const0_rtx), x);
16242 x = gen_rtx_SET (const0_rtx, x);
16243 start_sequence ();
16244 vselect_insn = emit_insn (x);
16245 end_sequence ();
16246 }
16247
16248 /* Construct (set target (vec_select op0 (parallel perm))) and
16249 return true if that's a valid instruction in the active ISA. */
16250
16251 static bool
16252 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16253 unsigned nelt, bool testing_p)
16254 {
16255 unsigned int i;
16256 rtx x, save_vconcat;
16257 int icode;
16258
16259 if (vselect_insn == NULL_RTX)
16260 init_vselect_insn ();
16261
16262 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16263 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16264 for (i = 0; i < nelt; ++i)
16265 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16266 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16267 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16268 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16269 SET_DEST (PATTERN (vselect_insn)) = target;
16270 icode = recog_memoized (vselect_insn);
16271
16272 if (icode >= 0 && !testing_p)
16273 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16274
16275 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16276 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16277 INSN_CODE (vselect_insn) = -1;
16278
16279 return icode >= 0;
16280 }
16281
16282 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16283
16284 static bool
16285 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16286 const unsigned char *perm, unsigned nelt,
16287 bool testing_p)
16288 {
16289 machine_mode v2mode;
16290 rtx x;
16291 bool ok;
16292
16293 if (vselect_insn == NULL_RTX)
16294 init_vselect_insn ();
16295
16296 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16297 return false;
16298 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16299 PUT_MODE (x, v2mode);
16300 XEXP (x, 0) = op0;
16301 XEXP (x, 1) = op1;
16302 ok = expand_vselect (target, x, perm, nelt, testing_p);
16303 XEXP (x, 0) = const0_rtx;
16304 XEXP (x, 1) = const0_rtx;
16305 return ok;
16306 }
16307
16308 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16309 using movss or movsd. */
16310 static bool
16311 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16312 {
16313 machine_mode vmode = d->vmode;
16314 unsigned i, nelt = d->nelt;
16315 rtx x;
16316
16317 if (d->one_operand_p)
16318 return false;
16319
16320 if (!(TARGET_SSE && vmode == V4SFmode)
16321 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16322 && !(TARGET_SSE2 && vmode == V2DFmode))
16323 return false;
16324
16325 /* Only the first element is changed. */
16326 if (d->perm[0] != nelt && d->perm[0] != 0)
16327 return false;
16328 for (i = 1; i < nelt; ++i)
16329 if (d->perm[i] != i + nelt - d->perm[0])
16330 return false;
16331
16332 if (d->testing_p)
16333 return true;
16334
16335 if (d->perm[0] == nelt)
16336 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16337 else
16338 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16339
16340 emit_insn (gen_rtx_SET (d->target, x));
16341
16342 return true;
16343 }
16344
16345 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16346 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16347
16348 static bool
16349 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16350 {
16351 machine_mode mmode, vmode = d->vmode;
16352 unsigned i, nelt = d->nelt;
16353 unsigned HOST_WIDE_INT mask;
16354 rtx target, op0, op1, maskop, x;
16355 rtx rperm[32], vperm;
16356
16357 if (d->one_operand_p)
16358 return false;
16359 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16360 && (TARGET_AVX512BW
16361 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16362 ;
16363 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16364 ;
16365 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16366 ;
16367 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16368 ;
16369 else
16370 return false;
16371
16372 /* This is a blend, not a permute. Elements must stay in their
16373 respective lanes. */
16374 for (i = 0; i < nelt; ++i)
16375 {
16376 unsigned e = d->perm[i];
16377 if (!(e == i || e == i + nelt))
16378 return false;
16379 }
16380
16381 if (d->testing_p)
16382 return true;
16383
16384 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16385 decision should be extracted elsewhere, so that we only try that
16386 sequence once all budget==3 options have been tried. */
16387 target = d->target;
16388 op0 = d->op0;
16389 op1 = d->op1;
16390 mask = 0;
16391
16392 switch (vmode)
16393 {
16394 case E_V8DFmode:
16395 case E_V16SFmode:
16396 case E_V4DFmode:
16397 case E_V8SFmode:
16398 case E_V2DFmode:
16399 case E_V4SFmode:
16400 case E_V8HImode:
16401 case E_V8SImode:
16402 case E_V32HImode:
16403 case E_V64QImode:
16404 case E_V16SImode:
16405 case E_V8DImode:
16406 for (i = 0; i < nelt; ++i)
16407 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16408 break;
16409
16410 case E_V2DImode:
16411 for (i = 0; i < 2; ++i)
16412 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16413 vmode = V8HImode;
16414 goto do_subreg;
16415
16416 case E_V4SImode:
16417 for (i = 0; i < 4; ++i)
16418 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16419 vmode = V8HImode;
16420 goto do_subreg;
16421
16422 case E_V16QImode:
16423 /* See if bytes move in pairs so we can use pblendw with
16424 an immediate argument, rather than pblendvb with a vector
16425 argument. */
16426 for (i = 0; i < 16; i += 2)
16427 if (d->perm[i] + 1 != d->perm[i + 1])
16428 {
16429 use_pblendvb:
16430 for (i = 0; i < nelt; ++i)
16431 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16432
16433 finish_pblendvb:
16434 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16435 vperm = force_reg (vmode, vperm);
16436
16437 if (GET_MODE_SIZE (vmode) == 16)
16438 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16439 else
16440 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16441 if (target != d->target)
16442 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16443 return true;
16444 }
16445
16446 for (i = 0; i < 8; ++i)
16447 mask |= (d->perm[i * 2] >= 16) << i;
16448 vmode = V8HImode;
16449 /* FALLTHRU */
16450
16451 do_subreg:
16452 target = gen_reg_rtx (vmode);
16453 op0 = gen_lowpart (vmode, op0);
16454 op1 = gen_lowpart (vmode, op1);
16455 break;
16456
16457 case E_V32QImode:
16458 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16459 for (i = 0; i < 32; i += 2)
16460 if (d->perm[i] + 1 != d->perm[i + 1])
16461 goto use_pblendvb;
16462 /* See if bytes move in quadruplets. If yes, vpblendd
16463 with immediate can be used. */
16464 for (i = 0; i < 32; i += 4)
16465 if (d->perm[i] + 2 != d->perm[i + 2])
16466 break;
16467 if (i < 32)
16468 {
16469 /* See if bytes move the same in both lanes. If yes,
16470 vpblendw with immediate can be used. */
16471 for (i = 0; i < 16; i += 2)
16472 if (d->perm[i] + 16 != d->perm[i + 16])
16473 goto use_pblendvb;
16474
16475 /* Use vpblendw. */
16476 for (i = 0; i < 16; ++i)
16477 mask |= (d->perm[i * 2] >= 32) << i;
16478 vmode = V16HImode;
16479 goto do_subreg;
16480 }
16481
16482 /* Use vpblendd. */
16483 for (i = 0; i < 8; ++i)
16484 mask |= (d->perm[i * 4] >= 32) << i;
16485 vmode = V8SImode;
16486 goto do_subreg;
16487
16488 case E_V16HImode:
16489 /* See if words move in pairs. If yes, vpblendd can be used. */
16490 for (i = 0; i < 16; i += 2)
16491 if (d->perm[i] + 1 != d->perm[i + 1])
16492 break;
16493 if (i < 16)
16494 {
16495 /* See if words move the same in both lanes. If not,
16496 vpblendvb must be used. */
16497 for (i = 0; i < 8; i++)
16498 if (d->perm[i] + 8 != d->perm[i + 8])
16499 {
16500 /* Use vpblendvb. */
16501 for (i = 0; i < 32; ++i)
16502 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16503
16504 vmode = V32QImode;
16505 nelt = 32;
16506 target = gen_reg_rtx (vmode);
16507 op0 = gen_lowpart (vmode, op0);
16508 op1 = gen_lowpart (vmode, op1);
16509 goto finish_pblendvb;
16510 }
16511
16512 /* Use vpblendw. */
16513 for (i = 0; i < 16; ++i)
16514 mask |= (d->perm[i] >= 16) << i;
16515 break;
16516 }
16517
16518 /* Use vpblendd. */
16519 for (i = 0; i < 8; ++i)
16520 mask |= (d->perm[i * 2] >= 16) << i;
16521 vmode = V8SImode;
16522 goto do_subreg;
16523
16524 case E_V4DImode:
16525 /* Use vpblendd. */
16526 for (i = 0; i < 4; ++i)
16527 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16528 vmode = V8SImode;
16529 goto do_subreg;
16530
16531 default:
16532 gcc_unreachable ();
16533 }
16534
16535 switch (vmode)
16536 {
16537 case E_V8DFmode:
16538 case E_V8DImode:
16539 mmode = QImode;
16540 break;
16541 case E_V16SFmode:
16542 case E_V16SImode:
16543 mmode = HImode;
16544 break;
16545 case E_V32HImode:
16546 mmode = SImode;
16547 break;
16548 case E_V64QImode:
16549 mmode = DImode;
16550 break;
16551 default:
16552 mmode = VOIDmode;
16553 }
16554
16555 if (mmode != VOIDmode)
16556 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16557 else
16558 maskop = GEN_INT (mask);
16559
16560 /* This matches five different patterns with the different modes. */
16561 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16562 x = gen_rtx_SET (target, x);
16563 emit_insn (x);
16564 if (target != d->target)
16565 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16566
16567 return true;
16568 }
16569
16570 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16571 in terms of the variable form of vpermilps.
16572
16573 Note that we will have already failed the immediate input vpermilps,
16574 which requires that the high and low part shuffle be identical; the
16575 variable form doesn't require that. */
16576
16577 static bool
16578 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16579 {
16580 rtx rperm[8], vperm;
16581 unsigned i;
16582
16583 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16584 return false;
16585
16586 /* We can only permute within the 128-bit lane. */
16587 for (i = 0; i < 8; ++i)
16588 {
16589 unsigned e = d->perm[i];
16590 if (i < 4 ? e >= 4 : e < 4)
16591 return false;
16592 }
16593
16594 if (d->testing_p)
16595 return true;
16596
16597 for (i = 0; i < 8; ++i)
16598 {
16599 unsigned e = d->perm[i];
16600
16601 /* Within each 128-bit lane, the elements of op0 are numbered
16602 from 0 and the elements of op1 are numbered from 4. */
16603 if (e >= 8 + 4)
16604 e -= 8;
16605 else if (e >= 4)
16606 e -= 4;
16607
16608 rperm[i] = GEN_INT (e);
16609 }
16610
16611 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16612 vperm = force_reg (V8SImode, vperm);
16613 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16614
16615 return true;
16616 }
16617
16618 /* Return true if permutation D can be performed as VMODE permutation
16619 instead. */
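/* For exposition only (hypothetical example): the V16QImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5, ... } moves bytes in aligned pairs, so it is also
   expressible as the V8HImode permutation { 1, 0, 3, 2, ... }; the chunk test
   below verifies exactly this kind of alignment.  */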
16620
16621 static bool
16622 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16623 {
16624 unsigned int i, j, chunk;
16625
16626 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16627 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16628 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16629 return false;
16630
16631 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16632 return true;
16633
16634 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16635 for (i = 0; i < d->nelt; i += chunk)
16636 if (d->perm[i] & (chunk - 1))
16637 return false;
16638 else
16639 for (j = 1; j < chunk; ++j)
16640 if (d->perm[i] + j != d->perm[i + j])
16641 return false;
16642
16643 return true;
16644 }
16645
16646 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16647 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16648
16649 static bool
16650 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16651 {
16652 unsigned i, nelt, eltsz, mask;
16653 unsigned char perm[64];
16654 machine_mode vmode = V16QImode;
16655 rtx rperm[64], vperm, target, op0, op1;
16656
16657 nelt = d->nelt;
16658
16659 if (!d->one_operand_p)
16660 {
16661 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16662 {
16663 if (TARGET_AVX2
16664 && valid_perm_using_mode_p (V2TImode, d))
16665 {
16666 if (d->testing_p)
16667 return true;
16668
16669 /* Use vperm2i128 insn. The pattern uses
16670 V4DImode instead of V2TImode. */
16671 target = d->target;
16672 if (d->vmode != V4DImode)
16673 target = gen_reg_rtx (V4DImode);
16674 op0 = gen_lowpart (V4DImode, d->op0);
16675 op1 = gen_lowpart (V4DImode, d->op1);
16676 rperm[0]
16677 = GEN_INT ((d->perm[0] / (nelt / 2))
16678 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16679 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16680 if (target != d->target)
16681 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16682 return true;
16683 }
16684 return false;
16685 }
16686 }
16687 else
16688 {
16689 if (GET_MODE_SIZE (d->vmode) == 16)
16690 {
16691 if (!TARGET_SSSE3)
16692 return false;
16693 }
16694 else if (GET_MODE_SIZE (d->vmode) == 32)
16695 {
16696 if (!TARGET_AVX2)
16697 return false;
16698
16699 /* V4DImode should be already handled through
16700 expand_vselect by vpermq instruction. */
16701 gcc_assert (d->vmode != V4DImode);
16702
16703 vmode = V32QImode;
16704 if (d->vmode == V8SImode
16705 || d->vmode == V16HImode
16706 || d->vmode == V32QImode)
16707 {
16708 /* First see if vpermq can be used for
16709 V8SImode/V16HImode/V32QImode. */
16710 if (valid_perm_using_mode_p (V4DImode, d))
16711 {
16712 for (i = 0; i < 4; i++)
16713 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16714 if (d->testing_p)
16715 return true;
16716 target = gen_reg_rtx (V4DImode);
16717 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16718 perm, 4, false))
16719 {
16720 emit_move_insn (d->target,
16721 gen_lowpart (d->vmode, target));
16722 return true;
16723 }
16724 return false;
16725 }
16726
16727 /* Next see if vpermd can be used. */
16728 if (valid_perm_using_mode_p (V8SImode, d))
16729 vmode = V8SImode;
16730 }
16731 /* Or if vpermps can be used. */
16732 else if (d->vmode == V8SFmode)
16733 vmode = V8SImode;
16734
16735 if (vmode == V32QImode)
16736 {
16737 /* vpshufb only works intra lanes; it is not
16738 possible to shuffle bytes between the lanes. */
16739 for (i = 0; i < nelt; ++i)
16740 if ((d->perm[i] ^ i) & (nelt / 2))
16741 return false;
16742 }
16743 }
16744 else if (GET_MODE_SIZE (d->vmode) == 64)
16745 {
16746 if (!TARGET_AVX512BW)
16747 return false;
16748
16749 /* If vpermq didn't work, vpshufb won't work either. */
16750 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16751 return false;
16752
16753 vmode = V64QImode;
16754 if (d->vmode == V16SImode
16755 || d->vmode == V32HImode
16756 || d->vmode == V64QImode)
16757 {
16758 /* First see if vpermq can be used for
16759 V16SImode/V32HImode/V64QImode. */
16760 if (valid_perm_using_mode_p (V8DImode, d))
16761 {
16762 for (i = 0; i < 8; i++)
16763 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16764 if (d->testing_p)
16765 return true;
16766 target = gen_reg_rtx (V8DImode);
16767 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16768 perm, 8, false))
16769 {
16770 emit_move_insn (d->target,
16771 gen_lowpart (d->vmode, target));
16772 return true;
16773 }
16774 return false;
16775 }
16776
16777 /* Next see if vpermd can be used. */
16778 if (valid_perm_using_mode_p (V16SImode, d))
16779 vmode = V16SImode;
16780 }
16781 /* Or if vpermps can be used. */
16782 else if (d->vmode == V16SFmode)
16783 vmode = V16SImode;
16784 if (vmode == V64QImode)
16785 {
16786 /* vpshufb only works intra lanes; it is not
16787 possible to shuffle bytes between the lanes. */
16788 for (i = 0; i < nelt; ++i)
16789 if ((d->perm[i] ^ i) & (3 * nelt / 4))
16790 return false;
16791 }
16792 }
16793 else
16794 return false;
16795 }
16796
16797 if (d->testing_p)
16798 return true;
16799
16800 if (vmode == V8SImode)
16801 for (i = 0; i < 8; ++i)
16802 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16803 else if (vmode == V16SImode)
16804 for (i = 0; i < 16; ++i)
16805 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16806 else
16807 {
16808 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16809 if (!d->one_operand_p)
16810 mask = 2 * nelt - 1;
16811 else if (vmode == V16QImode)
16812 mask = nelt - 1;
16813 else if (vmode == V64QImode)
16814 mask = nelt / 4 - 1;
16815 else
16816 mask = nelt / 2 - 1;
16817
16818 for (i = 0; i < nelt; ++i)
16819 {
16820 unsigned j, e = d->perm[i] & mask;
16821 for (j = 0; j < eltsz; ++j)
16822 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16823 }
16824 }
16825
16826 vperm = gen_rtx_CONST_VECTOR (vmode,
16827 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16828 vperm = force_reg (vmode, vperm);
16829
16830 target = d->target;
16831 if (d->vmode != vmode)
16832 target = gen_reg_rtx (vmode);
16833 op0 = gen_lowpart (vmode, d->op0);
16834 if (d->one_operand_p)
16835 {
16836 if (vmode == V16QImode)
16837 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16838 else if (vmode == V32QImode)
16839 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16840 else if (vmode == V64QImode)
16841 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16842 else if (vmode == V8SFmode)
16843 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16844 else if (vmode == V8SImode)
16845 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16846 else if (vmode == V16SFmode)
16847 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16848 else if (vmode == V16SImode)
16849 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16850 else
16851 gcc_unreachable ();
16852 }
16853 else
16854 {
16855 op1 = gen_lowpart (vmode, d->op1);
16856 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16857 }
16858 if (target != d->target)
16859 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16860
16861 return true;
16862 }
16863
16864 /* For V*[QHS]Imode permutations, check whether the same permutation can be
16865 performed in a 2x, 4x or 8x wider inner mode, and rewrite *ND to use it. */
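/* For exposition only (hypothetical example): the V8HImode permutation
   { 2, 3, 6, 7, 0, 1, 4, 5 } keeps adjacent even/odd pairs together, so ND is
   rewritten as the V4SImode permutation { 1, 3, 0, 2 }, and the recursion then
   checks whether that can be widened again to V2DImode.  */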
16866
16867 static bool
16868 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16869 struct expand_vec_perm_d *nd)
16870 {
16871 int i;
16872 machine_mode mode = VOIDmode;
16873
16874 switch (d->vmode)
16875 {
16876 case E_V16QImode: mode = V8HImode; break;
16877 case E_V32QImode: mode = V16HImode; break;
16878 case E_V64QImode: mode = V32HImode; break;
16879 case E_V8HImode: mode = V4SImode; break;
16880 case E_V16HImode: mode = V8SImode; break;
16881 case E_V32HImode: mode = V16SImode; break;
16882 case E_V4SImode: mode = V2DImode; break;
16883 case E_V8SImode: mode = V4DImode; break;
16884 case E_V16SImode: mode = V8DImode; break;
16885 default: return false;
16886 }
16887 for (i = 0; i < d->nelt; i += 2)
16888 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16889 return false;
16890 nd->vmode = mode;
16891 nd->nelt = d->nelt / 2;
16892 for (i = 0; i < nd->nelt; i++)
16893 nd->perm[i] = d->perm[2 * i] / 2;
16894 if (GET_MODE_INNER (mode) != DImode)
16895 canonicalize_vector_int_perm (nd, nd);
16896 if (nd != d)
16897 {
16898 nd->one_operand_p = d->one_operand_p;
16899 nd->testing_p = d->testing_p;
16900 if (d->op0 == d->op1)
16901 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16902 else
16903 {
16904 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16905 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16906 }
16907 if (d->testing_p)
16908 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16909 else
16910 nd->target = gen_reg_rtx (nd->vmode);
16911 }
16912 return true;
16913 }
16914
16915 /* Try to expand one-operand permutation with constant mask. */
16916
16917 static bool
16918 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
16919 {
16920 machine_mode mode = GET_MODE (d->op0);
16921 machine_mode maskmode = mode;
16922 rtx (*gen) (rtx, rtx, rtx) = NULL;
16923 rtx target, op0, mask;
16924 rtx vec[64];
16925
16926 if (!rtx_equal_p (d->op0, d->op1))
16927 return false;
16928
16929 if (!TARGET_AVX512F)
16930 return false;
16931
16932 switch (mode)
16933 {
16934 case E_V16SImode:
16935 gen = gen_avx512f_permvarv16si;
16936 break;
16937 case E_V16SFmode:
16938 gen = gen_avx512f_permvarv16sf;
16939 maskmode = V16SImode;
16940 break;
16941 case E_V8DImode:
16942 gen = gen_avx512f_permvarv8di;
16943 break;
16944 case E_V8DFmode:
16945 gen = gen_avx512f_permvarv8df;
16946 maskmode = V8DImode;
16947 break;
16948 default:
16949 return false;
16950 }
16951
16952 target = d->target;
16953 op0 = d->op0;
16954 for (int i = 0; i < d->nelt; ++i)
16955 vec[i] = GEN_INT (d->perm[i]);
16956 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
16957 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
16958 return true;
16959 }
16960
16961 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
16962
16963 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
16964 in a single instruction. */
16965
16966 static bool
16967 expand_vec_perm_1 (struct expand_vec_perm_d *d)
16968 {
16969 unsigned i, nelt = d->nelt;
16970 struct expand_vec_perm_d nd;
16971
16972 /* Check plain VEC_SELECT first, because AVX has instructions that could
16973 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
16974 input where SEL+CONCAT may not. */
16975 if (d->one_operand_p)
16976 {
16977 int mask = nelt - 1;
16978 bool identity_perm = true;
16979 bool broadcast_perm = true;
16980
16981 for (i = 0; i < nelt; i++)
16982 {
16983 nd.perm[i] = d->perm[i] & mask;
16984 if (nd.perm[i] != i)
16985 identity_perm = false;
16986 if (nd.perm[i])
16987 broadcast_perm = false;
16988 }
16989
16990 if (identity_perm)
16991 {
16992 if (!d->testing_p)
16993 emit_move_insn (d->target, d->op0);
16994 return true;
16995 }
16996 else if (broadcast_perm && TARGET_AVX2)
16997 {
16998 /* Use vpbroadcast{b,w,d}. */
16999 rtx (*gen) (rtx, rtx) = NULL;
17000 switch (d->vmode)
17001 {
17002 case E_V64QImode:
17003 if (TARGET_AVX512BW)
17004 gen = gen_avx512bw_vec_dupv64qi_1;
17005 break;
17006 case E_V32QImode:
17007 gen = gen_avx2_pbroadcastv32qi_1;
17008 break;
17009 case E_V32HImode:
17010 if (TARGET_AVX512BW)
17011 gen = gen_avx512bw_vec_dupv32hi_1;
17012 break;
17013 case E_V16HImode:
17014 gen = gen_avx2_pbroadcastv16hi_1;
17015 break;
17016 case E_V16SImode:
17017 if (TARGET_AVX512F)
17018 gen = gen_avx512f_vec_dupv16si_1;
17019 break;
17020 case E_V8SImode:
17021 gen = gen_avx2_pbroadcastv8si_1;
17022 break;
17023 case E_V16QImode:
17024 gen = gen_avx2_pbroadcastv16qi;
17025 break;
17026 case E_V8HImode:
17027 gen = gen_avx2_pbroadcastv8hi;
17028 break;
17029 case E_V16SFmode:
17030 if (TARGET_AVX512F)
17031 gen = gen_avx512f_vec_dupv16sf_1;
17032 break;
17033 case E_V8SFmode:
17034 gen = gen_avx2_vec_dupv8sf_1;
17035 break;
17036 case E_V8DFmode:
17037 if (TARGET_AVX512F)
17038 gen = gen_avx512f_vec_dupv8df_1;
17039 break;
17040 case E_V8DImode:
17041 if (TARGET_AVX512F)
17042 gen = gen_avx512f_vec_dupv8di_1;
17043 break;
17044 /* For other modes prefer other shuffles this function creates. */
17045 default: break;
17046 }
17047 if (gen != NULL)
17048 {
17049 if (!d->testing_p)
17050 emit_insn (gen (d->target, d->op0));
17051 return true;
17052 }
17053 }
17054
17055 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17056 return true;
17057
17058 /* There are plenty of patterns in sse.md that are written for
17059 SEL+CONCAT and are not replicated for a single op. Perhaps
17060 that should be changed, to avoid the nastiness here. */
17061
17062 /* Recognize interleave style patterns, which means incrementing
17063 every other permutation operand. */
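/* E.g. the one-operand V4SImode permutation { 0 0 1 1 } becomes
   { 0 4 1 5 }, the vec_select of (vec_concat op0 op0) that
   punpckldq performs.  */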
17064 for (i = 0; i < nelt; i += 2)
17065 {
17066 nd.perm[i] = d->perm[i] & mask;
17067 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17068 }
17069 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17070 d->testing_p))
17071 return true;
17072
17073 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17074 if (nelt >= 4)
17075 {
17076 for (i = 0; i < nelt; i += 4)
17077 {
17078 nd.perm[i + 0] = d->perm[i + 0] & mask;
17079 nd.perm[i + 1] = d->perm[i + 1] & mask;
17080 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17081 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17082 }
17083
17084 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17085 d->testing_p))
17086 return true;
17087 }
17088 }
17089
17090 /* Try movss/movsd instructions. */
17091 if (expand_vec_perm_movs (d))
17092 return true;
17093
17094 /* Finally, try the fully general two operand permute. */
17095 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17096 d->testing_p))
17097 return true;
17098
17099 /* Recognize interleave style patterns with reversed operands. */
17100 if (!d->one_operand_p)
17101 {
17102 for (i = 0; i < nelt; ++i)
17103 {
17104 unsigned e = d->perm[i];
17105 if (e >= nelt)
17106 e -= nelt;
17107 else
17108 e += nelt;
17109 nd.perm[i] = e;
17110 }
17111
17112 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17113 d->testing_p))
17114 return true;
17115 }
17116
17117 /* Try the SSE4.1 blend variable merge instructions. */
17118 if (expand_vec_perm_blend (d))
17119 return true;
17120
17121 /* Try one of the AVX vpermil variable permutations. */
17122 if (expand_vec_perm_vpermil (d))
17123 return true;
17124
17125 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17126 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17127 if (expand_vec_perm_pshufb (d))
17128 return true;
17129
17130 /* Try the AVX2 vpalignr instruction. */
17131 if (expand_vec_perm_palignr (d, true))
17132 return true;
17133
17134 /* Try the AVX512F vpermd/vpermps/vpermq/vpermpd instructions.  */
17135 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17136 return true;
17137
17138 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17139 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17140 return true;
17141
17142 /* See if we can get the same permutation in a different vector integer
17143 mode.  */
17144 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17145 {
17146 if (!d->testing_p)
17147 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17148 return true;
17149 }
17150 return false;
17151 }
17152
17153 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17154 in terms of a pair of pshuflw + pshufhw instructions. */
17155
17156 static bool
17157 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17158 {
17159 unsigned char perm2[MAX_VECT_LEN];
17160 unsigned i;
17161 bool ok;
17162
17163 if (d->vmode != V8HImode || !d->one_operand_p)
17164 return false;
17165
17166 /* The two permutations only operate in 64-bit lanes. */
17167 for (i = 0; i < 4; ++i)
17168 if (d->perm[i] >= 4)
17169 return false;
17170 for (i = 4; i < 8; ++i)
17171 if (d->perm[i] < 4)
17172 return false;
17173
17174 if (d->testing_p)
17175 return true;
17176
17177 /* Emit the pshuflw. */
17178 memcpy (perm2, d->perm, 4);
17179 for (i = 4; i < 8; ++i)
17180 perm2[i] = i;
17181 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17182 gcc_assert (ok);
17183
17184 /* Emit the pshufhw. */
17185 memcpy (perm2 + 4, d->perm + 4, 4);
17186 for (i = 0; i < 4; ++i)
17187 perm2[i] = i;
17188 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17189 gcc_assert (ok);
17190
17191 return true;
17192 }
17193
17194 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17195 the permutation using the SSSE3 palignr instruction. This succeeds
17196 when all of the elements in PERM fit within one vector and we merely
17197 need to shift them down so that a single vector permutation has a
17198 chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
17199 the vpalignr instruction by itself can perform the requested permutation.  */
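/* For example, a V16QImode permutation whose indices are { 5 6 ... 20 }
   selects a contiguous 16-byte window of the concatenated operands, which
   palignr extracts with a shift of 5 bytes; if the window's bytes are
   wanted in a different order, the palignr is followed by a single
   one-operand shuffle (pshufb for 128-bit vectors).  */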
17200
17201 static bool
17202 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17203 {
17204 unsigned i, nelt = d->nelt;
17205 unsigned min, max, minswap, maxswap;
17206 bool in_order, ok, swap = false;
17207 rtx shift, target;
17208 struct expand_vec_perm_d dcopy;
17209
17210 /* Even with AVX, palignr only operates on 128-bit vectors;
17211 with AVX2, vpalignr operates on both 128-bit lanes in parallel.  */
17212 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17213 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17214 return false;
17215
17216 min = 2 * nelt;
17217 max = 0;
17218 minswap = 2 * nelt;
17219 maxswap = 0;
17220 for (i = 0; i < nelt; ++i)
17221 {
17222 unsigned e = d->perm[i];
17223 unsigned eswap = d->perm[i] ^ nelt;
17224 if (GET_MODE_SIZE (d->vmode) == 32)
17225 {
17226 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17227 eswap = e ^ (nelt / 2);
17228 }
17229 if (e < min)
17230 min = e;
17231 if (e > max)
17232 max = e;
17233 if (eswap < minswap)
17234 minswap = eswap;
17235 if (eswap > maxswap)
17236 maxswap = eswap;
17237 }
17238 if (min == 0
17239 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17240 {
17241 if (d->one_operand_p
17242 || minswap == 0
17243 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17244 ? nelt / 2 : nelt))
17245 return false;
17246 swap = true;
17247 min = minswap;
17248 max = maxswap;
17249 }
17250
17251 /* Given that we have SSSE3, we know we'll be able to implement the
17252 single operand permutation after the palignr with pshufb for
17253 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17254 first. */
17255 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17256 return true;
17257
17258 dcopy = *d;
17259 if (swap)
17260 {
17261 dcopy.op0 = d->op1;
17262 dcopy.op1 = d->op0;
17263 for (i = 0; i < nelt; ++i)
17264 dcopy.perm[i] ^= nelt;
17265 }
17266
17267 in_order = true;
17268 for (i = 0; i < nelt; ++i)
17269 {
17270 unsigned e = dcopy.perm[i];
17271 if (GET_MODE_SIZE (d->vmode) == 32
17272 && e >= nelt
17273 && (e & (nelt / 2 - 1)) < min)
17274 e = e - min - (nelt / 2);
17275 else
17276 e = e - min;
17277 if (e != i)
17278 in_order = false;
17279 dcopy.perm[i] = e;
17280 }
17281 dcopy.one_operand_p = true;
17282
17283 if (single_insn_only_p && !in_order)
17284 return false;
17285
17286 /* For AVX2, test whether we can permute the result in one instruction. */
17287 if (d->testing_p)
17288 {
17289 if (in_order)
17290 return true;
17291 dcopy.op1 = dcopy.op0;
17292 return expand_vec_perm_1 (&dcopy);
17293 }
17294
17295 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17296 if (GET_MODE_SIZE (d->vmode) == 16)
17297 {
17298 target = gen_reg_rtx (TImode);
17299 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17300 gen_lowpart (TImode, dcopy.op0), shift));
17301 }
17302 else
17303 {
17304 target = gen_reg_rtx (V2TImode);
17305 emit_insn (gen_avx2_palignrv2ti (target,
17306 gen_lowpart (V2TImode, dcopy.op1),
17307 gen_lowpart (V2TImode, dcopy.op0),
17308 shift));
17309 }
17310
17311 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17312
17313 /* Test for the degenerate case where the alignment by itself
17314 produces the desired permutation. */
17315 if (in_order)
17316 {
17317 emit_move_insn (d->target, dcopy.op0);
17318 return true;
17319 }
17320
17321 ok = expand_vec_perm_1 (&dcopy);
17322 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17323
17324 return ok;
17325 }
17326
17327 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17328 the permutation using the SSE4_1 pblendv instruction.  Potentially
17329 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and 1 pblendv.  */
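/* E.g. for the V8HImode permutation { 0 1 8 3 4 5 9 7 }, elements 8 and 9
   are first gathered into positions 2 and 6 by a one-operand shuffle of
   d->op1, and a blend (pblendw in that case) then merges them with d->op0.  */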
17330
17331 static bool
17332 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17333 {
17334 unsigned i, which, nelt = d->nelt;
17335 struct expand_vec_perm_d dcopy, dcopy1;
17336 machine_mode vmode = d->vmode;
17337 bool ok;
17338
17339 /* Use the same checks as in expand_vec_perm_blend. */
17340 if (d->one_operand_p)
17341 return false;
17342 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17343 ;
17344 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17345 ;
17346 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17347 ;
17348 else
17349 return false;
17350
17351 /* Figure out which permutation elements do not stay in their
17352 respective positions, and which operand they come from.  */
17353 for (i = 0, which = 0; i < nelt; ++i)
17354 {
17355 unsigned e = d->perm[i];
17356 if (e != i)
17357 which |= (e < nelt ? 1 : 2);
17358 }
17359 /* We can pblend the part where elements do not stay in their
17360 respective positions only when those elements all come from one
17361 half of the permutation, i.e. from the same operand.
17362 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
17363 positions, but both are >= 8.
17364 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
17365 respective positions, and 8 >= 8 but 2 is not.  */
17366 if (which != 1 && which != 2)
17367 return false;
17368 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17369 return true;
17370
17371 /* First we apply a one-operand permutation to the part whose
17372 elements do not stay in their respective positions.  */
17373 dcopy = *d;
17374 if (which == 2)
17375 dcopy.op0 = dcopy.op1 = d->op1;
17376 else
17377 dcopy.op0 = dcopy.op1 = d->op0;
17378 if (!d->testing_p)
17379 dcopy.target = gen_reg_rtx (vmode);
17380 dcopy.one_operand_p = true;
17381
17382 for (i = 0; i < nelt; ++i)
17383 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17384
17385 ok = expand_vec_perm_1 (&dcopy);
17386 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17387 return false;
17388 else
17389 gcc_assert (ok);
17390 if (d->testing_p)
17391 return true;
17392
17393 /* Next we put permuted elements into their positions. */
17394 dcopy1 = *d;
17395 if (which == 2)
17396 dcopy1.op1 = dcopy.target;
17397 else
17398 dcopy1.op0 = dcopy.target;
17399
17400 for (i = 0; i < nelt; ++i)
17401 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17402
17403 ok = expand_vec_perm_blend (&dcopy1);
17404 gcc_assert (ok);
17405
17406 return true;
17407 }
17408
17409 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17410
17411 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17412 a two vector permutation into a single vector permutation by using
17413 an interleave operation to merge the vectors. */
17414
17415 static bool
17416 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17417 {
17418 struct expand_vec_perm_d dremap, dfinal;
17419 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17420 unsigned HOST_WIDE_INT contents;
17421 unsigned char remap[2 * MAX_VECT_LEN];
17422 rtx_insn *seq;
17423 bool ok, same_halves = false;
17424
17425 if (GET_MODE_SIZE (d->vmode) == 16)
17426 {
17427 if (d->one_operand_p)
17428 return false;
17429 }
17430 else if (GET_MODE_SIZE (d->vmode) == 32)
17431 {
17432 if (!TARGET_AVX)
17433 return false;
17434 /* For 32-byte modes allow even d->one_operand_p.
17435 The lack of cross-lane shuffling in some instructions
17436 might prevent a single insn shuffle. */
17437 dfinal = *d;
17438 dfinal.testing_p = true;
17439 /* If expand_vec_perm_interleave3 can expand this into
17440 a 3 insn sequence, give up and let it be expanded as
17441 a 3 insn sequence.  While that is one insn longer,
17442 it doesn't need a memory operand, and in the common
17443 case where interleave low and interleave high permutations
17444 with the same operands are adjacent, the two together need
17445 only 4 insns after CSE.  */
17446 if (expand_vec_perm_interleave3 (&dfinal))
17447 return false;
17448 }
17449 else
17450 return false;
17451
17452 /* Examine from whence the elements come. */
17453 contents = 0;
17454 for (i = 0; i < nelt; ++i)
17455 contents |= HOST_WIDE_INT_1U << d->perm[i];
17456
17457 memset (remap, 0xff, sizeof (remap));
17458 dremap = *d;
17459
17460 if (GET_MODE_SIZE (d->vmode) == 16)
17461 {
17462 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17463
17464 /* Split the two input vectors into 4 halves. */
17465 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17466 h2 = h1 << nelt2;
17467 h3 = h2 << nelt2;
17468 h4 = h3 << nelt2;
17469
17470 /* If the elements come from the low halves, use interleave low; similarly,
17471 use interleave high for the high halves.  If the elements are from
17472 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
17473 if ((contents & (h1 | h3)) == contents)
17474 {
17475 /* punpckl* */
17476 for (i = 0; i < nelt2; ++i)
17477 {
17478 remap[i] = i * 2;
17479 remap[i + nelt] = i * 2 + 1;
17480 dremap.perm[i * 2] = i;
17481 dremap.perm[i * 2 + 1] = i + nelt;
17482 }
17483 if (!TARGET_SSE2 && d->vmode == V4SImode)
17484 dremap.vmode = V4SFmode;
17485 }
17486 else if ((contents & (h2 | h4)) == contents)
17487 {
17488 /* punpckh* */
17489 for (i = 0; i < nelt2; ++i)
17490 {
17491 remap[i + nelt2] = i * 2;
17492 remap[i + nelt + nelt2] = i * 2 + 1;
17493 dremap.perm[i * 2] = i + nelt2;
17494 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17495 }
17496 if (!TARGET_SSE2 && d->vmode == V4SImode)
17497 dremap.vmode = V4SFmode;
17498 }
17499 else if ((contents & (h1 | h4)) == contents)
17500 {
17501 /* shufps */
17502 for (i = 0; i < nelt2; ++i)
17503 {
17504 remap[i] = i;
17505 remap[i + nelt + nelt2] = i + nelt2;
17506 dremap.perm[i] = i;
17507 dremap.perm[i + nelt2] = i + nelt + nelt2;
17508 }
17509 if (nelt != 4)
17510 {
17511 /* shufpd */
17512 dremap.vmode = V2DImode;
17513 dremap.nelt = 2;
17514 dremap.perm[0] = 0;
17515 dremap.perm[1] = 3;
17516 }
17517 }
17518 else if ((contents & (h2 | h3)) == contents)
17519 {
17520 /* shufps */
17521 for (i = 0; i < nelt2; ++i)
17522 {
17523 remap[i + nelt2] = i;
17524 remap[i + nelt] = i + nelt2;
17525 dremap.perm[i] = i + nelt2;
17526 dremap.perm[i + nelt2] = i + nelt;
17527 }
17528 if (nelt != 4)
17529 {
17530 /* shufpd */
17531 dremap.vmode = V2DImode;
17532 dremap.nelt = 2;
17533 dremap.perm[0] = 1;
17534 dremap.perm[1] = 2;
17535 }
17536 }
17537 else
17538 return false;
17539 }
17540 else
17541 {
17542 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17543 unsigned HOST_WIDE_INT q[8];
17544 unsigned int nonzero_halves[4];
17545
17546 /* Split the two input vectors into 8 quarters. */
17547 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17548 for (i = 1; i < 8; ++i)
17549 q[i] = q[0] << (nelt4 * i);
17550 for (i = 0; i < 4; ++i)
17551 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17552 {
17553 nonzero_halves[nzcnt] = i;
17554 ++nzcnt;
17555 }
17556
17557 if (nzcnt == 1)
17558 {
17559 gcc_assert (d->one_operand_p);
17560 nonzero_halves[1] = nonzero_halves[0];
17561 same_halves = true;
17562 }
17563 else if (d->one_operand_p)
17564 {
17565 gcc_assert (nonzero_halves[0] == 0);
17566 gcc_assert (nonzero_halves[1] == 1);
17567 }
17568
17569 if (nzcnt <= 2)
17570 {
17571 if (d->perm[0] / nelt2 == nonzero_halves[1])
17572 {
17573 /* Attempt to increase the likelihood that dfinal
17574 shuffle will be intra-lane. */
17575 std::swap (nonzero_halves[0], nonzero_halves[1]);
17576 }
17577
17578 /* vperm2f128 or vperm2i128. */
17579 for (i = 0; i < nelt2; ++i)
17580 {
17581 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17582 remap[i + nonzero_halves[0] * nelt2] = i;
17583 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17584 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17585 }
17586
17587 if (d->vmode != V8SFmode
17588 && d->vmode != V4DFmode
17589 && d->vmode != V8SImode)
17590 {
17591 dremap.vmode = V8SImode;
17592 dremap.nelt = 8;
17593 for (i = 0; i < 4; ++i)
17594 {
17595 dremap.perm[i] = i + nonzero_halves[0] * 4;
17596 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17597 }
17598 }
17599 }
17600 else if (d->one_operand_p)
17601 return false;
17602 else if (TARGET_AVX2
17603 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17604 {
17605 /* vpunpckl* */
17606 for (i = 0; i < nelt4; ++i)
17607 {
17608 remap[i] = i * 2;
17609 remap[i + nelt] = i * 2 + 1;
17610 remap[i + nelt2] = i * 2 + nelt2;
17611 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17612 dremap.perm[i * 2] = i;
17613 dremap.perm[i * 2 + 1] = i + nelt;
17614 dremap.perm[i * 2 + nelt2] = i + nelt2;
17615 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17616 }
17617 }
17618 else if (TARGET_AVX2
17619 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17620 {
17621 /* vpunpckh* */
17622 for (i = 0; i < nelt4; ++i)
17623 {
17624 remap[i + nelt4] = i * 2;
17625 remap[i + nelt + nelt4] = i * 2 + 1;
17626 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17627 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17628 dremap.perm[i * 2] = i + nelt4;
17629 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17630 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17631 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17632 }
17633 }
17634 else
17635 return false;
17636 }
17637
17638 /* Use the remapping array set up above to move the elements from their
17639 swizzled locations into their final destinations. */
17640 dfinal = *d;
17641 for (i = 0; i < nelt; ++i)
17642 {
17643 unsigned e = remap[d->perm[i]];
17644 gcc_assert (e < nelt);
17645 /* If same_halves is true, both halves of the remapped vector are the
17646 same. Avoid cross-lane accesses if possible. */
17647 if (same_halves && i >= nelt2)
17648 {
17649 gcc_assert (e < nelt2);
17650 dfinal.perm[i] = e + nelt2;
17651 }
17652 else
17653 dfinal.perm[i] = e;
17654 }
17655 if (!d->testing_p)
17656 {
17657 dremap.target = gen_reg_rtx (dremap.vmode);
17658 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17659 }
17660 dfinal.op1 = dfinal.op0;
17661 dfinal.one_operand_p = true;
17662
17663 /* Test if the final remap can be done with a single insn. For V4SFmode or
17664 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17665 start_sequence ();
17666 ok = expand_vec_perm_1 (&dfinal);
17667 seq = get_insns ();
17668 end_sequence ();
17669
17670 if (!ok)
17671 return false;
17672
17673 if (d->testing_p)
17674 return true;
17675
17676 if (dremap.vmode != dfinal.vmode)
17677 {
17678 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17679 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17680 }
17681
17682 ok = expand_vec_perm_1 (&dremap);
17683 gcc_assert (ok);
17684
17685 emit_insn (seq);
17686 return true;
17687 }
17688
17689 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17690 a single vector cross-lane permutation into vpermq followed
17691 by any of the single insn permutations. */
17692
17693 static bool
17694 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17695 {
17696 struct expand_vec_perm_d dremap, dfinal;
17697 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17698 unsigned contents[2];
17699 bool ok;
17700
17701 if (!(TARGET_AVX2
17702 && (d->vmode == V32QImode || d->vmode == V16HImode)
17703 && d->one_operand_p))
17704 return false;
17705
17706 contents[0] = 0;
17707 contents[1] = 0;
17708 for (i = 0; i < nelt2; ++i)
17709 {
17710 contents[0] |= 1u << (d->perm[i] / nelt4);
17711 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17712 }
17713
17714 for (i = 0; i < 2; ++i)
17715 {
17716 unsigned int cnt = 0;
17717 for (j = 0; j < 4; ++j)
17718 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17719 return false;
17720 }
17721
17722 if (d->testing_p)
17723 return true;
17724
17725 dremap = *d;
17726 dremap.vmode = V4DImode;
17727 dremap.nelt = 4;
17728 dremap.target = gen_reg_rtx (V4DImode);
17729 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17730 dremap.op1 = dremap.op0;
17731 dremap.one_operand_p = true;
17732 for (i = 0; i < 2; ++i)
17733 {
17734 unsigned int cnt = 0;
17735 for (j = 0; j < 4; ++j)
17736 if ((contents[i] & (1u << j)) != 0)
17737 dremap.perm[2 * i + cnt++] = j;
17738 for (; cnt < 2; ++cnt)
17739 dremap.perm[2 * i + cnt] = 0;
17740 }
17741
17742 dfinal = *d;
17743 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17744 dfinal.op1 = dfinal.op0;
17745 dfinal.one_operand_p = true;
17746 for (i = 0, j = 0; i < nelt; ++i)
17747 {
17748 if (i == nelt2)
17749 j = 2;
17750 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17751 if ((d->perm[i] / nelt4) == dremap.perm[j])
17752 ;
17753 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17754 dfinal.perm[i] |= nelt4;
17755 else
17756 gcc_unreachable ();
17757 }
17758
17759 ok = expand_vec_perm_1 (&dremap);
17760 gcc_assert (ok);
17761
17762 ok = expand_vec_perm_1 (&dfinal);
17763 gcc_assert (ok);
17764
17765 return true;
17766 }
17767
17768 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17769
17770 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
17771 a vector permutation using two instructions, vperm2f128 resp.
17772 vperm2i128 followed by any single in-lane permutation. */
17773
17774 static bool
17775 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17776 {
17777 struct expand_vec_perm_d dfirst, dsecond;
17778 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17779 bool ok;
17780
17781 if (!TARGET_AVX
17782 || GET_MODE_SIZE (d->vmode) != 32
17783 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17784 return false;
17785
17786 dsecond = *d;
17787 dsecond.one_operand_p = false;
17788 dsecond.testing_p = true;
17789
17790 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17791 immediate. For perm < 16 the second permutation uses
17792 d->op0 as first operand, for perm >= 16 it uses d->op1
17793 as first operand. The second operand is the result of
17794 vperm2[fi]128. */
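/* E.g. perm == 1 encodes immediate 0x01, i.e. the vperm2[fi]128
   result is d->op0 with its two 128-bit lanes swapped.  */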
17795 for (perm = 0; perm < 32; perm++)
17796 {
17797 /* Ignore permutations which do not move anything cross-lane. */
17798 if (perm < 16)
17799 {
17800 /* The second shuffle for e.g. V4DFmode has
17801 0123 and ABCD operands.
17802 Ignore AB23, as 23 is already in the second lane
17803 of the first operand. */
17804 if ((perm & 0xc) == (1 << 2)) continue;
17805 /* And 01CD, as 01 is in the first lane of the first
17806 operand. */
17807 if ((perm & 3) == 0) continue;
17808 /* And 4567, as then the vperm2[fi]128 doesn't change
17809 anything on the original 4567 second operand. */
17810 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17811 }
17812 else
17813 {
17814 /* The second shuffle for e.g. V4DFmode has
17815 4567 and ABCD operands.
17816 Ignore AB67, as 67 is already in the second lane
17817 of the first operand. */
17818 if ((perm & 0xc) == (3 << 2)) continue;
17819 /* And 45CD, as 45 is in the first lane of the first
17820 operand. */
17821 if ((perm & 3) == 2) continue;
17822 /* And 0123, as then the vperm2[fi]128 doesn't change
17823 anything on the original 0123 first operand. */
17824 if ((perm & 0xf) == (1 << 2)) continue;
17825 }
17826
17827 for (i = 0; i < nelt; i++)
17828 {
17829 j = d->perm[i] / nelt2;
17830 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17831 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17832 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17833 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17834 else
17835 break;
17836 }
17837
17838 if (i == nelt)
17839 {
17840 start_sequence ();
17841 ok = expand_vec_perm_1 (&dsecond);
17842 end_sequence ();
17843 }
17844 else
17845 ok = false;
17846
17847 if (ok)
17848 {
17849 if (d->testing_p)
17850 return true;
17851
17852 /* Found a usable second shuffle. dfirst will be
17853 vperm2f128 on d->op0 and d->op1. */
17854 dsecond.testing_p = false;
17855 dfirst = *d;
17856 dfirst.target = gen_reg_rtx (d->vmode);
17857 for (i = 0; i < nelt; i++)
17858 dfirst.perm[i] = (i & (nelt2 - 1))
17859 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17860
17861 canonicalize_perm (&dfirst);
17862 ok = expand_vec_perm_1 (&dfirst);
17863 gcc_assert (ok);
17864
17865 /* And dsecond is some single insn shuffle, taking
17866 d->op0 and result of vperm2f128 (if perm < 16) or
17867 d->op1 and result of vperm2f128 (otherwise). */
17868 if (perm >= 16)
17869 dsecond.op0 = dsecond.op1;
17870 dsecond.op1 = dfirst.target;
17871
17872 ok = expand_vec_perm_1 (&dsecond);
17873 gcc_assert (ok);
17874
17875 return true;
17876 }
17877
17878 /* For one operand, the only useful vperm2f128 permutation is 0x01
17879 aka lanes swap. */
17880 if (d->one_operand_p)
17881 return false;
17882 }
17883
17884 return false;
17885 }
17886
17887 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17888 a two vector permutation using 2 intra-lane interleave insns
17889 and cross-lane shuffle for 32-byte vectors. */
17890
17891 static bool
17892 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17893 {
17894 unsigned i, nelt;
17895 rtx (*gen) (rtx, rtx, rtx);
17896
17897 if (d->one_operand_p)
17898 return false;
17899 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17900 ;
17901 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17902 ;
17903 else
17904 return false;
17905
17906 nelt = d->nelt;
17907 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17908 return false;
17909 for (i = 0; i < nelt; i += 2)
17910 if (d->perm[i] != d->perm[0] + i / 2
17911 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17912 return false;
17913
17914 if (d->testing_p)
17915 return true;
17916
17917 switch (d->vmode)
17918 {
17919 case E_V32QImode:
17920 if (d->perm[0])
17921 gen = gen_vec_interleave_highv32qi;
17922 else
17923 gen = gen_vec_interleave_lowv32qi;
17924 break;
17925 case E_V16HImode:
17926 if (d->perm[0])
17927 gen = gen_vec_interleave_highv16hi;
17928 else
17929 gen = gen_vec_interleave_lowv16hi;
17930 break;
17931 case E_V8SImode:
17932 if (d->perm[0])
17933 gen = gen_vec_interleave_highv8si;
17934 else
17935 gen = gen_vec_interleave_lowv8si;
17936 break;
17937 case E_V4DImode:
17938 if (d->perm[0])
17939 gen = gen_vec_interleave_highv4di;
17940 else
17941 gen = gen_vec_interleave_lowv4di;
17942 break;
17943 case E_V8SFmode:
17944 if (d->perm[0])
17945 gen = gen_vec_interleave_highv8sf;
17946 else
17947 gen = gen_vec_interleave_lowv8sf;
17948 break;
17949 case E_V4DFmode:
17950 if (d->perm[0])
17951 gen = gen_vec_interleave_highv4df;
17952 else
17953 gen = gen_vec_interleave_lowv4df;
17954 break;
17955 default:
17956 gcc_unreachable ();
17957 }
17958
17959 emit_insn (gen (d->target, d->op0, d->op1));
17960 return true;
17961 }
17962
17963 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
17964 a single vector permutation using a single intra-lane vector
17965 permutation, vperm2f128 swapping the lanes and vblend* insn blending
17966 the non-swapped and swapped vectors together. */
17967
17968 static bool
17969 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
17970 {
17971 struct expand_vec_perm_d dfirst, dsecond;
17972 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
17973 rtx_insn *seq;
17974 bool ok;
17975 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
17976
17977 if (!TARGET_AVX
17978 || TARGET_AVX2
17979 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
17980 || !d->one_operand_p)
17981 return false;
17982
17983 dfirst = *d;
17984 for (i = 0; i < nelt; i++)
17985 dfirst.perm[i] = 0xff;
17986 for (i = 0, msk = 0; i < nelt; i++)
17987 {
17988 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
17989 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
17990 return false;
17991 dfirst.perm[j] = d->perm[i];
17992 if (j != i)
17993 msk |= (1 << i);
17994 }
17995 for (i = 0; i < nelt; i++)
17996 if (dfirst.perm[i] == 0xff)
17997 dfirst.perm[i] = i;
17998
17999 if (!d->testing_p)
18000 dfirst.target = gen_reg_rtx (dfirst.vmode);
18001
18002 start_sequence ();
18003 ok = expand_vec_perm_1 (&dfirst);
18004 seq = get_insns ();
18005 end_sequence ();
18006
18007 if (!ok)
18008 return false;
18009
18010 if (d->testing_p)
18011 return true;
18012
18013 emit_insn (seq);
18014
18015 dsecond = *d;
18016 dsecond.op0 = dfirst.target;
18017 dsecond.op1 = dfirst.target;
18018 dsecond.one_operand_p = true;
18019 dsecond.target = gen_reg_rtx (dsecond.vmode);
18020 for (i = 0; i < nelt; i++)
18021 dsecond.perm[i] = i ^ nelt2;
18022
18023 ok = expand_vec_perm_1 (&dsecond);
18024 gcc_assert (ok);
18025
18026 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18027 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18028 return true;
18029 }
18030
18031 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18032 permutation using two vperm2f128, followed by a vshufpd insn blending
18033 the two vectors together. */
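/* For example, for the V4DFmode permutation { 1 6 3 4 } the two lane
   shuffles produce { 0 1 2 3 } and { 6 7 4 5 } (dfirst and dsecond), and
   the final vshufpd selects { 1 4 3 6 } from their concatenation, i.e.
   elements 1, 6, 3 and 4 of the original operands.  */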
18034
18035 static bool
18036 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18037 {
18038 struct expand_vec_perm_d dfirst, dsecond, dthird;
18039 bool ok;
18040
18041 if (!TARGET_AVX || (d->vmode != V4DFmode))
18042 return false;
18043
18044 if (d->testing_p)
18045 return true;
18046
18047 dfirst = *d;
18048 dsecond = *d;
18049 dthird = *d;
18050
18051 dfirst.perm[0] = (d->perm[0] & ~1);
18052 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18053 dfirst.perm[2] = (d->perm[2] & ~1);
18054 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18055 dsecond.perm[0] = (d->perm[1] & ~1);
18056 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18057 dsecond.perm[2] = (d->perm[3] & ~1);
18058 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18059 dthird.perm[0] = (d->perm[0] % 2);
18060 dthird.perm[1] = (d->perm[1] % 2) + 4;
18061 dthird.perm[2] = (d->perm[2] % 2) + 2;
18062 dthird.perm[3] = (d->perm[3] % 2) + 6;
18063
18064 dfirst.target = gen_reg_rtx (dfirst.vmode);
18065 dsecond.target = gen_reg_rtx (dsecond.vmode);
18066 dthird.op0 = dfirst.target;
18067 dthird.op1 = dsecond.target;
18068 dthird.one_operand_p = false;
18069
18070 canonicalize_perm (&dfirst);
18071 canonicalize_perm (&dsecond);
18072
18073 ok = expand_vec_perm_1 (&dfirst)
18074 && expand_vec_perm_1 (&dsecond)
18075 && expand_vec_perm_1 (&dthird);
18076
18077 gcc_assert (ok);
18078
18079 return true;
18080 }
18081
18082 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18083
18084 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18085 a two vector permutation using two intra-lane vector
18086 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18087 the non-swapped and swapped vectors together. */
18088
18089 static bool
18090 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18091 {
18092 struct expand_vec_perm_d dfirst, dsecond, dthird;
18093 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18094 rtx_insn *seq1, *seq2;
18095 bool ok;
18096 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18097
18098 if (!TARGET_AVX
18099 || TARGET_AVX2
18100 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18101 || d->one_operand_p)
18102 return false;
18103
18104 dfirst = *d;
18105 dsecond = *d;
18106 for (i = 0; i < nelt; i++)
18107 {
18108 dfirst.perm[i] = 0xff;
18109 dsecond.perm[i] = 0xff;
18110 }
18111 for (i = 0, msk = 0; i < nelt; i++)
18112 {
18113 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18114 if (j == i)
18115 {
18116 dfirst.perm[j] = d->perm[i];
18117 which1 |= (d->perm[i] < nelt ? 1 : 2);
18118 }
18119 else
18120 {
18121 dsecond.perm[j] = d->perm[i];
18122 which2 |= (d->perm[i] < nelt ? 1 : 2);
18123 msk |= (1U << i);
18124 }
18125 }
18126 if (msk == 0 || msk == (1U << nelt) - 1)
18127 return false;
18128
18129 if (!d->testing_p)
18130 {
18131 dfirst.target = gen_reg_rtx (dfirst.vmode);
18132 dsecond.target = gen_reg_rtx (dsecond.vmode);
18133 }
18134
18135 for (i = 0; i < nelt; i++)
18136 {
18137 if (dfirst.perm[i] == 0xff)
18138 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18139 if (dsecond.perm[i] == 0xff)
18140 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18141 }
18142 canonicalize_perm (&dfirst);
18143 start_sequence ();
18144 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18145 seq1 = get_insns ();
18146 end_sequence ();
18147
18148 if (!ok)
18149 return false;
18150
18151 canonicalize_perm (&dsecond);
18152 start_sequence ();
18153 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18154 seq2 = get_insns ();
18155 end_sequence ();
18156
18157 if (!ok)
18158 return false;
18159
18160 if (d->testing_p)
18161 return true;
18162
18163 emit_insn (seq1);
18164 emit_insn (seq2);
18165
18166 dthird = *d;
18167 dthird.op0 = dsecond.target;
18168 dthird.op1 = dsecond.target;
18169 dthird.one_operand_p = true;
18170 dthird.target = gen_reg_rtx (dthird.vmode);
18171 for (i = 0; i < nelt; i++)
18172 dthird.perm[i] = i ^ nelt2;
18173
18174 ok = expand_vec_perm_1 (&dthird);
18175 gcc_assert (ok);
18176
18177 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18178 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18179 return true;
18180 }
18181
18182 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18183 permutation with two pshufb insns and an ior. We should have already
18184 failed all two instruction sequences. */
18185
18186 static bool
18187 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18188 {
18189 rtx rperm[2][16], vperm, l, h, op, m128;
18190 unsigned int i, nelt, eltsz;
18191
18192 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18193 return false;
18194 gcc_assert (!d->one_operand_p);
18195
18196 if (d->testing_p)
18197 return true;
18198
18199 nelt = d->nelt;
18200 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18201
18202 /* Generate two permutation masks. If the required element is within
18203 the given vector it is shuffled into the proper lane. If the required
18204 element is in the other vector, force a zero into the lane by setting
18205 bit 7 in the permutation mask. */
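/* E.g. for a V16QImode permutation starting { 0 17 2 19 ... }, the mask
   for d->op0 starts { 0 -128 2 -128 ... } and the mask for d->op1 starts
   { -128 1 -128 3 ... }; the two pshufb results are then combined with
   an ior.  */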
18206 m128 = GEN_INT (-128);
18207 for (i = 0; i < nelt; ++i)
18208 {
18209 unsigned j, e = d->perm[i];
18210 unsigned which = (e >= nelt);
18211 if (e >= nelt)
18212 e -= nelt;
18213
18214 for (j = 0; j < eltsz; ++j)
18215 {
18216 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18217 rperm[1-which][i*eltsz + j] = m128;
18218 }
18219 }
18220
18221 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18222 vperm = force_reg (V16QImode, vperm);
18223
18224 l = gen_reg_rtx (V16QImode);
18225 op = gen_lowpart (V16QImode, d->op0);
18226 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18227
18228 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18229 vperm = force_reg (V16QImode, vperm);
18230
18231 h = gen_reg_rtx (V16QImode);
18232 op = gen_lowpart (V16QImode, d->op1);
18233 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18234
18235 op = d->target;
18236 if (d->vmode != V16QImode)
18237 op = gen_reg_rtx (V16QImode);
18238 emit_insn (gen_iorv16qi3 (op, l, h));
18239 if (op != d->target)
18240 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18241
18242 return true;
18243 }
18244
18245 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
18246 with two vpshufb insns, vpermq and vpor. We should have already failed
18247 all two or three instruction sequences. */
18248
18249 static bool
18250 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18251 {
18252 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18253 unsigned int i, nelt, eltsz;
18254
18255 if (!TARGET_AVX2
18256 || !d->one_operand_p
18257 || (d->vmode != V32QImode && d->vmode != V16HImode))
18258 return false;
18259
18260 if (d->testing_p)
18261 return true;
18262
18263 nelt = d->nelt;
18264 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18265
18266 /* Generate two permutation masks.  If the required element is within
18267 the same lane, it is shuffled in.  If the required element is from the
18268 other lane, force a zero by setting bit 7 in the permutation mask.
18269 The other mask has a non-negative element wherever an element is
18270 requested from the other lane, but that element is also moved to the
18271 other lane, so that after the two V2TImode halves of the vpshufb
18272 result are swapped it ends up in the right place.  */
18273 m128 = GEN_INT (-128);
18274 for (i = 0; i < nelt; ++i)
18275 {
18276 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18277 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18278
18279 for (j = 0; j < eltsz; ++j)
18280 {
18281 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18282 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18283 }
18284 }
18285
18286 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18287 vperm = force_reg (V32QImode, vperm);
18288
18289 h = gen_reg_rtx (V32QImode);
18290 op = gen_lowpart (V32QImode, d->op0);
18291 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18292
18293 /* Swap the 128-bit lanes of h into hp.  */
18294 hp = gen_reg_rtx (V4DImode);
18295 op = gen_lowpart (V4DImode, h);
18296 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18297 const1_rtx));
18298
18299 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18300 vperm = force_reg (V32QImode, vperm);
18301
18302 l = gen_reg_rtx (V32QImode);
18303 op = gen_lowpart (V32QImode, d->op0);
18304 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18305
18306 op = d->target;
18307 if (d->vmode != V32QImode)
18308 op = gen_reg_rtx (V32QImode);
18309 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18310 if (op != d->target)
18311 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18312
18313 return true;
18314 }
18315
18316 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18317 and extract-odd permutations of two V32QImode or V16HImode operands
18318 with two vpshufb insns, vpor and vpermq. We should have already
18319 failed all two or three instruction sequences. */
18320
18321 static bool
18322 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18323 {
18324 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18325 unsigned int i, nelt, eltsz;
18326
18327 if (!TARGET_AVX2
18328 || d->one_operand_p
18329 || (d->vmode != V32QImode && d->vmode != V16HImode))
18330 return false;
18331
18332 for (i = 0; i < d->nelt; ++i)
18333 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18334 return false;
18335
18336 if (d->testing_p)
18337 return true;
18338
18339 nelt = d->nelt;
18340 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18341
18342 /* Generate two permutation masks. In the first permutation mask
18343 the first quarter will contain indexes for the first half
18344 of the op0, the second quarter will contain bit 7 set, third quarter
18345 will contain indexes for the second half of the op0 and the
18346 last quarter bit 7 set. In the second permutation mask
18347 the first quarter will contain bit 7 set, the second quarter
18348 indexes for the first half of the op1, the third quarter bit 7 set
18349 and last quarter indexes for the second half of the op1.
18350 I.e. the first mask e.g. for V32QImode extract even will be:
18351 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18352 (all values masked with 0xf except for -128) and second mask
18353 for extract even will be
18354 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18355 m128 = GEN_INT (-128);
18356 for (i = 0; i < nelt; ++i)
18357 {
18358 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18359 unsigned which = d->perm[i] >= nelt;
18360 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18361
18362 for (j = 0; j < eltsz; ++j)
18363 {
18364 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18365 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18366 }
18367 }
18368
18369 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18370 vperm = force_reg (V32QImode, vperm);
18371
18372 l = gen_reg_rtx (V32QImode);
18373 op = gen_lowpart (V32QImode, d->op0);
18374 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18375
18376 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18377 vperm = force_reg (V32QImode, vperm);
18378
18379 h = gen_reg_rtx (V32QImode);
18380 op = gen_lowpart (V32QImode, d->op1);
18381 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18382
18383 ior = gen_reg_rtx (V32QImode);
18384 emit_insn (gen_iorv32qi3 (ior, l, h));
18385
18386 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18387 op = gen_reg_rtx (V4DImode);
18388 ior = gen_lowpart (V4DImode, ior);
18389 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18390 const1_rtx, GEN_INT (3)));
18391 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18392
18393 return true;
18394 }
18395
18396 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18397 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18398 with two "and" and "pack" or two "shift" and "pack" insns. We should
18399 have already failed all two instruction sequences. */
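/* E.g. for V16QImode extract-even, both operands are viewed as V8HImode,
   every word is masked with 0x00ff and packuswb packs the surviving low
   bytes of d->op0 and d->op1 into the result; for extract-odd the mask
   is replaced by a logical right shift by 8.  */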
18400
18401 static bool
18402 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18403 {
18404 rtx op, dop0, dop1, t;
18405 unsigned i, odd, c, s, nelt = d->nelt;
18406 bool end_perm = false;
18407 machine_mode half_mode;
18408 rtx (*gen_and) (rtx, rtx, rtx);
18409 rtx (*gen_pack) (rtx, rtx, rtx);
18410 rtx (*gen_shift) (rtx, rtx, rtx);
18411
18412 if (d->one_operand_p)
18413 return false;
18414
18415 switch (d->vmode)
18416 {
18417 case E_V8HImode:
18418 /* Required for "pack". */
18419 if (!TARGET_SSE4_1)
18420 return false;
18421 c = 0xffff;
18422 s = 16;
18423 half_mode = V4SImode;
18424 gen_and = gen_andv4si3;
18425 gen_pack = gen_sse4_1_packusdw;
18426 gen_shift = gen_lshrv4si3;
18427 break;
18428 case E_V16QImode:
18429 /* No check as all instructions are SSE2. */
18430 c = 0xff;
18431 s = 8;
18432 half_mode = V8HImode;
18433 gen_and = gen_andv8hi3;
18434 gen_pack = gen_sse2_packuswb;
18435 gen_shift = gen_lshrv8hi3;
18436 break;
18437 case E_V16HImode:
18438 if (!TARGET_AVX2)
18439 return false;
18440 c = 0xffff;
18441 s = 16;
18442 half_mode = V8SImode;
18443 gen_and = gen_andv8si3;
18444 gen_pack = gen_avx2_packusdw;
18445 gen_shift = gen_lshrv8si3;
18446 end_perm = true;
18447 break;
18448 case E_V32QImode:
18449 if (!TARGET_AVX2)
18450 return false;
18451 c = 0xff;
18452 s = 8;
18453 half_mode = V16HImode;
18454 gen_and = gen_andv16hi3;
18455 gen_pack = gen_avx2_packuswb;
18456 gen_shift = gen_lshrv16hi3;
18457 end_perm = true;
18458 break;
18459 default:
18460 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18461 general shuffles. */
18462 return false;
18463 }
18464
18465 /* Check that permutation is even or odd. */
18466 odd = d->perm[0];
18467 if (odd > 1)
18468 return false;
18469
18470 for (i = 1; i < nelt; ++i)
18471 if (d->perm[i] != 2 * i + odd)
18472 return false;
18473
18474 if (d->testing_p)
18475 return true;
18476
18477 dop0 = gen_reg_rtx (half_mode);
18478 dop1 = gen_reg_rtx (half_mode);
18479 if (odd == 0)
18480 {
18481 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18482 t = force_reg (half_mode, t);
18483 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18484 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18485 }
18486 else
18487 {
18488 emit_insn (gen_shift (dop0,
18489 gen_lowpart (half_mode, d->op0),
18490 GEN_INT (s)));
18491 emit_insn (gen_shift (dop1,
18492 gen_lowpart (half_mode, d->op1),
18493 GEN_INT (s)));
18494 }
18495 /* For the AVX2 256-bit case we need to permute the pack result.  */
18496 if (TARGET_AVX2 && end_perm)
18497 {
18498 op = gen_reg_rtx (d->vmode);
18499 t = gen_reg_rtx (V4DImode);
18500 emit_insn (gen_pack (op, dop0, dop1));
18501 emit_insn (gen_avx2_permv4di_1 (t,
18502 gen_lowpart (V4DImode, op),
18503 const0_rtx,
18504 const2_rtx,
18505 const1_rtx,
18506 GEN_INT (3)));
18507 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18508 }
18509 else
18510 emit_insn (gen_pack (d->target, dop0, dop1));
18511
18512 return true;
18513 }
18514
18515 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18516 and extract-odd permutations of two V64QI operands
18517 with two "shift", two "trunc" and one "concat" insns for "odd"
18518 and two "trunc" and one "concat" insn for "even".
18519 We should have already failed all two instruction sequences.  */
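/* E.g. for extract-odd, each operand viewed as V32HImode is shifted
   right by 8 so the odd bytes land in the low byte of every word, then
   each V32HImode value is truncated to V32QImode (vpmovwb) and a
   vec_concat forms the V64QImode result; extract-even skips the shifts.  */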
18520
18521 static bool
18522 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18523 {
18524 rtx t1, t2, t3, t4;
18525 unsigned i, odd, nelt = d->nelt;
18526
18527 if (!TARGET_AVX512BW
18528 || d->one_operand_p
18529 || d->vmode != V64QImode)
18530 return false;
18531
18532 /* Check that permutation is even or odd. */
18533 odd = d->perm[0];
18534 if (odd > 1)
18535 return false;
18536
18537 for (i = 1; i < nelt; ++i)
18538 if (d->perm[i] != 2 * i + odd)
18539 return false;
18540
18541 if (d->testing_p)
18542 return true;
18543
18544
18545 if (odd)
18546 {
18547 t1 = gen_reg_rtx (V32HImode);
18548 t2 = gen_reg_rtx (V32HImode);
18549 emit_insn (gen_lshrv32hi3 (t1,
18550 gen_lowpart (V32HImode, d->op0),
18551 GEN_INT (8)));
18552 emit_insn (gen_lshrv32hi3 (t2,
18553 gen_lowpart (V32HImode, d->op1),
18554 GEN_INT (8)));
18555 }
18556 else
18557 {
18558 t1 = gen_lowpart (V32HImode, d->op0);
18559 t2 = gen_lowpart (V32HImode, d->op1);
18560 }
18561
18562 t3 = gen_reg_rtx (V32QImode);
18563 t4 = gen_reg_rtx (V32QImode);
18564 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18565 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18566 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18567
18568 return true;
18569 }
18570
18571 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
18572 and extract-odd permutations. */
18573
18574 static bool
18575 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18576 {
18577 rtx t1, t2, t3, t4, t5;
18578
18579 switch (d->vmode)
18580 {
18581 case E_V4DFmode:
18582 if (d->testing_p)
18583 break;
18584 t1 = gen_reg_rtx (V4DFmode);
18585 t2 = gen_reg_rtx (V4DFmode);
18586
18587 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18588 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18589 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18590
18591 /* Now an unpck[lh]pd will produce the result required. */
18592 if (odd)
18593 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18594 else
18595 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18596 emit_insn (t3);
18597 break;
18598
18599 case E_V8SFmode:
18600 {
18601 int mask = odd ? 0xdd : 0x88;
18602
18603 if (d->testing_p)
18604 break;
18605 t1 = gen_reg_rtx (V8SFmode);
18606 t2 = gen_reg_rtx (V8SFmode);
18607 t3 = gen_reg_rtx (V8SFmode);
18608
18609 /* Shuffle within the 128-bit lanes to produce:
18610 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18611 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18612 GEN_INT (mask)));
18613
18614 /* Shuffle the lanes around to produce:
18615 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18616 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18617 GEN_INT (0x3)));
18618
18619 /* Shuffle within the 128-bit lanes to produce:
18620 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18621 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18622
18623 /* Shuffle within the 128-bit lanes to produce:
18624 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18625 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18626
18627 /* Shuffle the lanes around to produce:
18628 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18629 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18630 GEN_INT (0x20)));
18631 }
18632 break;
18633
18634 case E_V2DFmode:
18635 case E_V4SFmode:
18636 case E_V2DImode:
18637 case E_V2SImode:
18638 case E_V4SImode:
18639 /* These are always directly implementable by expand_vec_perm_1. */
18640 gcc_unreachable ();
18641
18642 case E_V2SFmode:
18643 gcc_assert (TARGET_MMX_WITH_SSE);
18644 /* We have no suitable instructions. */
18645 if (d->testing_p)
18646 return false;
18647 break;
18648
18649 case E_V4HImode:
18650 if (d->testing_p)
18651 break;
18652 /* We need 2*log2(N)-1 operations to achieve odd/even
18653 with interleave. */
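/* E.g. with d->op0 = { a0 a1 a2 a3 } and d->op1 = { b0 b1 b2 b3 }:
   punpcklwd and punpckhwd give { a0 b0 a1 b1 } and { a2 b2 a3 b3 },
   and a second punpcklwd resp. punpckhwd of those yields { a0 a2 b0 b2 }
   for even and { a1 a3 b1 b3 } for odd, i.e. 3 = 2*log2(4)-1 insns.  */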
18654 t1 = gen_reg_rtx (V4HImode);
18655 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
18656 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
18657 if (odd)
18658 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
18659 else
18660 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
18661 emit_insn (t2);
18662 break;
18663
18664 case E_V8HImode:
18665 if (TARGET_SSE4_1)
18666 return expand_vec_perm_even_odd_pack (d);
18667 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18668 return expand_vec_perm_pshufb2 (d);
18669 else
18670 {
18671 if (d->testing_p)
18672 break;
18673 /* We need 2*log2(N)-1 operations to achieve odd/even
18674 with interleave. */
18675 t1 = gen_reg_rtx (V8HImode);
18676 t2 = gen_reg_rtx (V8HImode);
18677 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18678 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18679 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18680 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18681 if (odd)
18682 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18683 else
18684 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18685 emit_insn (t3);
18686 }
18687 break;
18688
18689 case E_V16QImode:
18690 return expand_vec_perm_even_odd_pack (d);
18691
18692 case E_V16HImode:
18693 case E_V32QImode:
18694 return expand_vec_perm_even_odd_pack (d);
18695
18696 case E_V64QImode:
18697 return expand_vec_perm_even_odd_trunc (d);
18698
18699 case E_V4DImode:
18700 if (!TARGET_AVX2)
18701 {
18702 struct expand_vec_perm_d d_copy = *d;
18703 d_copy.vmode = V4DFmode;
18704 if (d->testing_p)
18705 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18706 else
18707 d_copy.target = gen_reg_rtx (V4DFmode);
18708 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18709 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18710 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18711 {
18712 if (!d->testing_p)
18713 emit_move_insn (d->target,
18714 gen_lowpart (V4DImode, d_copy.target));
18715 return true;
18716 }
18717 return false;
18718 }
18719
18720 if (d->testing_p)
18721 break;
18722
18723 t1 = gen_reg_rtx (V4DImode);
18724 t2 = gen_reg_rtx (V4DImode);
18725
18726 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18727 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18728 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18729
18730 /* Now an vpunpck[lh]qdq will produce the result required. */
18731 if (odd)
18732 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18733 else
18734 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18735 emit_insn (t3);
18736 break;
18737
18738 case E_V8SImode:
18739 if (!TARGET_AVX2)
18740 {
18741 struct expand_vec_perm_d d_copy = *d;
18742 d_copy.vmode = V8SFmode;
18743 if (d->testing_p)
18744 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18745 else
18746 d_copy.target = gen_reg_rtx (V8SFmode);
18747 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18748 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18749 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18750 {
18751 if (!d->testing_p)
18752 emit_move_insn (d->target,
18753 gen_lowpart (V8SImode, d_copy.target));
18754 return true;
18755 }
18756 return false;
18757 }
18758
18759 if (d->testing_p)
18760 break;
18761
18762 t1 = gen_reg_rtx (V8SImode);
18763 t2 = gen_reg_rtx (V8SImode);
18764 t3 = gen_reg_rtx (V4DImode);
18765 t4 = gen_reg_rtx (V4DImode);
18766 t5 = gen_reg_rtx (V4DImode);
18767
18768 /* Shuffle the lanes around into
18769 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18770 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18771 gen_lowpart (V4DImode, d->op1),
18772 GEN_INT (0x20)));
18773 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18774 gen_lowpart (V4DImode, d->op1),
18775 GEN_INT (0x31)));
18776
18777 /* Swap the 2nd and 3rd position in each lane into
18778 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18779 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18780 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18781 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18782 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18783
18784 /* Now an vpunpck[lh]qdq will produce
18785 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18786 if (odd)
18787 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18788 gen_lowpart (V4DImode, t2));
18789 else
18790 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18791 gen_lowpart (V4DImode, t2));
18792 emit_insn (t3);
18793 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18794 break;
18795
18796 default:
18797 gcc_unreachable ();
18798 }
18799
18800 return true;
18801 }
18802
18803 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18804 extract-even and extract-odd permutations. */
18805
18806 static bool
18807 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18808 {
18809 unsigned i, odd, nelt = d->nelt;
18810
18811 odd = d->perm[0];
18812 if (odd != 0 && odd != 1)
18813 return false;
18814
18815 for (i = 1; i < nelt; ++i)
18816 if (d->perm[i] != 2 * i + odd)
18817 return false;
18818
18819 return expand_vec_perm_even_odd_1 (d, odd);
18820 }
18821
18822 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
18823 permutations. We assume that expand_vec_perm_1 has already failed. */
18824
18825 static bool
18826 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18827 {
18828 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18829 machine_mode vmode = d->vmode;
18830 unsigned char perm2[4];
18831 rtx op0 = d->op0, dest;
18832 bool ok;
18833
18834 switch (vmode)
18835 {
18836 case E_V4DFmode:
18837 case E_V8SFmode:
18838 /* These are special-cased in sse.md so that we can optionally
18839 use the vbroadcast instruction. They expand to two insns
18840 if the input happens to be in a register. */
18841 gcc_unreachable ();
18842
18843 case E_V2DFmode:
18844 case E_V2SFmode:
18845 case E_V4SFmode:
18846 case E_V2DImode:
18847 case E_V2SImode:
18848 case E_V4SImode:
18849 /* These are always implementable using standard shuffle patterns. */
18850 gcc_unreachable ();
18851
18852 case E_V8HImode:
18853 case E_V16QImode:
18854 /* These can be implemented via interleave. We save one insn by
18855 stopping once we have promoted to V4SImode and then use pshufd. */
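      /* Concretely, broadcasting byte 5 of a V16QImode vector proceeds as
	 byte 5 -> (interleave low) -> halfword 5 of a V8HImode vector
	 -> (interleave high) -> dword 1 of a V4SImode vector, which the
	 final pshufd then replicates into all four dword positions.  */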
18856 if (d->testing_p)
18857 return true;
18858 do
18859 {
18860 rtx dest;
18861 rtx (*gen) (rtx, rtx, rtx)
18862 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18863 : gen_vec_interleave_lowv8hi;
18864
18865 if (elt >= nelt2)
18866 {
18867 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18868 : gen_vec_interleave_highv8hi;
18869 elt -= nelt2;
18870 }
18871 nelt2 /= 2;
18872
18873 dest = gen_reg_rtx (vmode);
18874 emit_insn (gen (dest, op0, op0));
18875 vmode = get_mode_wider_vector (vmode);
18876 op0 = gen_lowpart (vmode, dest);
18877 }
18878 while (vmode != V4SImode);
18879
18880 memset (perm2, elt, 4);
18881 dest = gen_reg_rtx (V4SImode);
18882 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18883 gcc_assert (ok);
18884 if (!d->testing_p)
18885 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18886 return true;
18887
18888 case E_V64QImode:
18889 case E_V32QImode:
18890 case E_V16HImode:
18891 case E_V8SImode:
18892 case E_V4DImode:
18893 /* For AVX2 broadcasts of the first element vpbroadcast* or
18894 vpermq should be used by expand_vec_perm_1. */
18895 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18896 return false;
18897
18898 default:
18899 gcc_unreachable ();
18900 }
18901 }
18902
18903 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18904 broadcast permutations. */
18905
18906 static bool
18907 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18908 {
18909 unsigned i, elt, nelt = d->nelt;
18910
18911 if (!d->one_operand_p)
18912 return false;
18913
18914 elt = d->perm[0];
18915 for (i = 1; i < nelt; ++i)
18916 if (d->perm[i] != elt)
18917 return false;
18918
18919 return expand_vec_perm_broadcast_1 (d);
18920 }
18921
18922 /* Implement arbitrary permutations of two V64QImode operands
18923 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18924 static bool
18925 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18926 {
18927 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
18928 return false;
18929
18930 if (d->testing_p)
18931 return true;
18932
18933 struct expand_vec_perm_d ds[2];
18934 rtx rperm[128], vperm, target0, target1;
18935 unsigned int i, nelt;
18936 machine_mode vmode;
18937
18938 nelt = d->nelt;
18939 vmode = V64QImode;
18940
18941 for (i = 0; i < 2; i++)
18942 {
18943 ds[i] = *d;
18944 ds[i].vmode = V32HImode;
18945 ds[i].nelt = 32;
18946 ds[i].target = gen_reg_rtx (V32HImode);
18947 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
18948 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
18949 }
18950
18951   /* Prepare the permutations so that the first one (ds[0]) puts the
18952      even destination bytes into their final positions, or one position
18953      higher, and the second one (ds[1]) puts the odd destination bytes
18954      into their final positions, or one position lower; the vpshufb
18955      masks built below then move each byte to its exact position.  */
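  /* For instance, if d->perm[10] == 37, then ds[0].perm[5] == 18 moves the
     word holding source byte 37 to bytes 10-11 of ds[0].target, and the
     pshufb index rperm[10] == (10 & 14) + (37 & 1) == 11 then picks the
     high byte of that word, i.e. source byte 37, into destination byte 10.  */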
18956
18957 for (i = 0; i < nelt; i++)
18958 {
18959 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
18960 if (i & 1)
18961 {
18962 rperm[i] = constm1_rtx;
18963 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18964 }
18965 else
18966 {
18967 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18968 rperm[i + 64] = constm1_rtx;
18969 }
18970 }
18971
18972 bool ok = expand_vec_perm_1 (&ds[0]);
18973 gcc_assert (ok);
18974 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
18975
18976 ok = expand_vec_perm_1 (&ds[1]);
18977 gcc_assert (ok);
18978 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
18979
18980 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
18981 vperm = force_reg (vmode, vperm);
18982 target0 = gen_reg_rtx (V64QImode);
18983 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
18984
18985 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
18986 vperm = force_reg (vmode, vperm);
18987 target1 = gen_reg_rtx (V64QImode);
18988 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
18989
18990 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
18991 return true;
18992 }
18993
18994 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
18995 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
18996 all the shorter instruction sequences. */
18997
18998 static bool
18999 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19000 {
19001 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19002 unsigned int i, nelt, eltsz;
19003 bool used[4];
19004
19005 if (!TARGET_AVX2
19006 || d->one_operand_p
19007 || (d->vmode != V32QImode && d->vmode != V16HImode))
19008 return false;
19009
19010 if (d->testing_p)
19011 return true;
19012
19013 nelt = d->nelt;
19014 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19015
19016 	  /* Generate 4 permutation masks.  If the required element is within
19017 	     the same lane, it is shuffled in.  If the required element is from
19018 	     the other lane, force a zero by setting bit 7 in the permutation
19019 	     mask.  The other mask for the same operand has non-negative
19020 	     elements only where the element is requested from the other lane;
19021 	     those bytes are also moved to the other lane, so that the two
19022 	     V2TImode halves of the vpshufb result can be swapped afterwards.  */
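  /* In the loop below, xlane is nonzero when the source and destination
     bytes lie in different 128-bit lanes, and which selects one of the four
     masks: bit 1 says the byte comes from the second operand, bit 0 that it
     crosses a lane.  Cross-lane bytes are stored at the lane-swapped
     position (i * eltsz + j) ^ xlane so the later vpermq fixes them up.  */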
19023 m128 = GEN_INT (-128);
19024 for (i = 0; i < 32; ++i)
19025 {
19026 rperm[0][i] = m128;
19027 rperm[1][i] = m128;
19028 rperm[2][i] = m128;
19029 rperm[3][i] = m128;
19030 }
19031 used[0] = false;
19032 used[1] = false;
19033 used[2] = false;
19034 used[3] = false;
19035 for (i = 0; i < nelt; ++i)
19036 {
19037 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19038 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19039 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19040
19041 for (j = 0; j < eltsz; ++j)
19042 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19043 used[which] = true;
19044 }
19045
19046 for (i = 0; i < 2; ++i)
19047 {
19048 if (!used[2 * i + 1])
19049 {
19050 h[i] = NULL_RTX;
19051 continue;
19052 }
19053 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19054 gen_rtvec_v (32, rperm[2 * i + 1]));
19055 vperm = force_reg (V32QImode, vperm);
19056 h[i] = gen_reg_rtx (V32QImode);
19057 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19058 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19059 }
19060
19061   /* Swap the two 128-bit lanes of h[X]; the { 2, 3, 0, 1 } qword selection below exchanges the halves.  */
19062 for (i = 0; i < 2; ++i)
19063 {
19064 if (h[i] == NULL_RTX)
19065 continue;
19066 op = gen_reg_rtx (V4DImode);
19067 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19068 const2_rtx, GEN_INT (3), const0_rtx,
19069 const1_rtx));
19070 h[i] = gen_lowpart (V32QImode, op);
19071 }
19072
19073 for (i = 0; i < 2; ++i)
19074 {
19075 if (!used[2 * i])
19076 {
19077 l[i] = NULL_RTX;
19078 continue;
19079 }
19080 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19081 vperm = force_reg (V32QImode, vperm);
19082 l[i] = gen_reg_rtx (V32QImode);
19083 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19084 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19085 }
19086
19087 for (i = 0; i < 2; ++i)
19088 {
19089 if (h[i] && l[i])
19090 {
19091 op = gen_reg_rtx (V32QImode);
19092 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19093 l[i] = op;
19094 }
19095 else if (h[i])
19096 l[i] = h[i];
19097 }
19098
19099 gcc_assert (l[0] && l[1]);
19100 op = d->target;
19101 if (d->vmode != V32QImode)
19102 op = gen_reg_rtx (V32QImode);
19103 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19104 if (op != d->target)
19105 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19106 return true;
19107 }
19108
19109 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19110 taken care of, perform the expansion in D and return true on success. */
19111
19112 static bool
19113 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19114 {
19115 /* Try a single instruction expansion. */
19116 if (expand_vec_perm_1 (d))
19117 return true;
19118
19119 /* Try sequences of two instructions. */
19120
19121 if (expand_vec_perm_pshuflw_pshufhw (d))
19122 return true;
19123
19124 if (expand_vec_perm_palignr (d, false))
19125 return true;
19126
19127 if (expand_vec_perm_interleave2 (d))
19128 return true;
19129
19130 if (expand_vec_perm_broadcast (d))
19131 return true;
19132
19133 if (expand_vec_perm_vpermq_perm_1 (d))
19134 return true;
19135
19136 if (expand_vec_perm_vperm2f128 (d))
19137 return true;
19138
19139 if (expand_vec_perm_pblendv (d))
19140 return true;
19141
19142 /* Try sequences of three instructions. */
19143
19144 if (expand_vec_perm_even_odd_pack (d))
19145 return true;
19146
19147 if (expand_vec_perm_2vperm2f128_vshuf (d))
19148 return true;
19149
19150 if (expand_vec_perm_pshufb2 (d))
19151 return true;
19152
19153 if (expand_vec_perm_interleave3 (d))
19154 return true;
19155
19156 if (expand_vec_perm_vperm2f128_vblend (d))
19157 return true;
19158
19159 /* Try sequences of four instructions. */
19160
19161 if (expand_vec_perm_even_odd_trunc (d))
19162 return true;
19163 if (expand_vec_perm_vpshufb2_vpermq (d))
19164 return true;
19165
19166 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19167 return true;
19168
19169 if (expand_vec_perm_vpermt2_vpshub2 (d))
19170 return true;
19171
19172 /* ??? Look for narrow permutations whose element orderings would
19173 allow the promotion to a wider mode. */
19174
19175 /* ??? Look for sequences of interleave or a wider permute that place
19176 the data into the correct lanes for a half-vector shuffle like
19177 pshuf[lh]w or vpermilps. */
19178
19179 /* ??? Look for sequences of interleave that produce the desired results.
19180 The combinatorics of punpck[lh] get pretty ugly... */
19181
19182 if (expand_vec_perm_even_odd (d))
19183 return true;
19184
19185 /* Even longer sequences. */
19186 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19187 return true;
19188
19189 /* See if we can get the same permutation in different vector integer
19190 mode. */
19191 struct expand_vec_perm_d nd;
19192 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19193 {
19194 if (!d->testing_p)
19195 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19196 return true;
19197 }
19198
19199 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19200 if (expand_vec_perm2_vperm2f128_vblend (d))
19201 return true;
19202
19203 return false;
19204 }
19205
19206 /* If a permutation only uses one operand, make that explicit.  Return true
19207    if the permutation references both operands.  */
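/* For example, with nelt == 4 the selector { 4, 5, 6, 7 } only references
   the second operand; it is folded to { 0, 1, 2, 3 } with op0 = op1 so that
   the single-input matchers can handle it.  */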
19208
19209 static bool
19210 canonicalize_perm (struct expand_vec_perm_d *d)
19211 {
19212 int i, which, nelt = d->nelt;
19213
19214 for (i = which = 0; i < nelt; ++i)
19215 which |= (d->perm[i] < nelt ? 1 : 2);
19216
19217 d->one_operand_p = true;
19218 switch (which)
19219 {
19220 default:
19221       gcc_unreachable ();
19222
19223 case 3:
19224 if (!rtx_equal_p (d->op0, d->op1))
19225 {
19226 d->one_operand_p = false;
19227 break;
19228 }
19229 /* The elements of PERM do not suggest that only the first operand
19230 is used, but both operands are identical. Allow easier matching
19231 of the permutation by folding the permutation into the single
19232 input vector. */
19233 /* FALLTHRU */
19234
19235 case 2:
19236 for (i = 0; i < nelt; ++i)
19237 d->perm[i] &= nelt - 1;
19238 d->op0 = d->op1;
19239 break;
19240
19241 case 1:
19242 d->op1 = d->op0;
19243 break;
19244 }
19245
19246 return (which == 3);
19247 }
19248
19249 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19250
19251 bool
19252 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19253 rtx op1, const vec_perm_indices &sel)
19254 {
19255 struct expand_vec_perm_d d;
19256 unsigned char perm[MAX_VECT_LEN];
19257 unsigned int i, nelt, which;
19258 bool two_args;
19259
19260 d.target = target;
19261 d.op0 = op0;
19262 d.op1 = op1;
19263
19264 d.vmode = vmode;
19265 gcc_assert (VECTOR_MODE_P (d.vmode));
19266 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19267 d.testing_p = !target;
19268
19269 gcc_assert (sel.length () == nelt);
19270 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19271
19272 /* Given sufficient ISA support we can just return true here
19273 for selected vector modes. */
19274 switch (d.vmode)
19275 {
19276 case E_V16SFmode:
19277 case E_V16SImode:
19278 case E_V8DImode:
19279 case E_V8DFmode:
19280 if (!TARGET_AVX512F)
19281 return false;
19282 /* All implementable with a single vperm[it]2 insn. */
19283 if (d.testing_p)
19284 return true;
19285 break;
19286 case E_V32HImode:
19287 if (!TARGET_AVX512BW)
19288 return false;
19289 if (d.testing_p)
19290 /* All implementable with a single vperm[it]2 insn. */
19291 return true;
19292 break;
19293 case E_V64QImode:
19294 if (!TARGET_AVX512BW)
19295 return false;
19296 if (d.testing_p)
19297 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19298 return true;
19299 break;
19300 case E_V8SImode:
19301 case E_V8SFmode:
19302 case E_V4DFmode:
19303 case E_V4DImode:
19304 if (!TARGET_AVX)
19305 return false;
19306 if (d.testing_p && TARGET_AVX512VL)
19307 /* All implementable with a single vperm[it]2 insn. */
19308 return true;
19309 break;
19310 case E_V16HImode:
19311 if (!TARGET_SSE2)
19312 return false;
19313 if (d.testing_p && TARGET_AVX2)
19314 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19315 return true;
19316 break;
19317 case E_V32QImode:
19318 if (!TARGET_SSE2)
19319 return false;
19320 if (d.testing_p && TARGET_AVX2)
19321 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19322 return true;
19323 break;
19324 case E_V8HImode:
19325 case E_V16QImode:
19326 if (!TARGET_SSE2)
19327 return false;
19328 /* Fall through. */
19329 case E_V4SImode:
19330 case E_V4SFmode:
19331 if (!TARGET_SSE)
19332 return false;
19333 /* All implementable with a single vpperm insn. */
19334 if (d.testing_p && TARGET_XOP)
19335 return true;
19336 /* All implementable with 2 pshufb + 1 ior. */
19337 if (d.testing_p && TARGET_SSSE3)
19338 return true;
19339 break;
19340 case E_V2SFmode:
19341 case E_V2SImode:
19342 case E_V4HImode:
19343 if (!TARGET_MMX_WITH_SSE)
19344 return false;
19345 break;
19346 case E_V2DImode:
19347 case E_V2DFmode:
19348 if (!TARGET_SSE)
19349 return false;
19350 /* All implementable with shufpd or unpck[lh]pd. */
19351 if (d.testing_p)
19352 return true;
19353 break;
19354 default:
19355 return false;
19356 }
19357
19358 for (i = which = 0; i < nelt; ++i)
19359 {
19360 unsigned char e = sel[i];
19361 gcc_assert (e < 2 * nelt);
19362 d.perm[i] = e;
19363 perm[i] = e;
19364 which |= (e < nelt ? 1 : 2);
19365 }
19366
19367 if (d.testing_p)
19368 {
19369       /* If all elements come from the second vector, fold them onto the first.  */
19370 if (which == 2)
19371 for (i = 0; i < nelt; ++i)
19372 d.perm[i] -= nelt;
19373
19374 /* Check whether the mask can be applied to the vector type. */
19375 d.one_operand_p = (which != 3);
19376
19377 /* Implementable with shufps or pshufd. */
19378 if (d.one_operand_p
19379 && (d.vmode == V4SFmode || d.vmode == V2SFmode
19380 || d.vmode == V4SImode || d.vmode == V2SImode))
19381 return true;
19382
19383 /* Otherwise we have to go through the motions and see if we can
19384 figure out how to generate the requested permutation. */
19385 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19386 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19387 if (!d.one_operand_p)
19388 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19389
19390 start_sequence ();
19391 bool ret = ix86_expand_vec_perm_const_1 (&d);
19392 end_sequence ();
19393
19394 return ret;
19395 }
19396
19397 two_args = canonicalize_perm (&d);
19398
19399 if (ix86_expand_vec_perm_const_1 (&d))
19400 return true;
19401
19402   /* If the selector says both arguments are needed, but the operands are the
19403      same, the above tried to expand with one_operand_p set and a flattened
19404      selector.  If that didn't work, retry without one_operand_p; we
19405      succeeded with that during testing.  */
19406 if (two_args && d.one_operand_p)
19407 {
19408 d.one_operand_p = false;
19409 memcpy (d.perm, perm, sizeof (perm));
19410 return ix86_expand_vec_perm_const_1 (&d);
19411 }
19412
19413 return false;
19414 }
19415
19416 void
19417 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19418 {
19419 struct expand_vec_perm_d d;
19420 unsigned i, nelt;
19421
19422 d.target = targ;
19423 d.op0 = op0;
19424 d.op1 = op1;
19425 d.vmode = GET_MODE (targ);
19426 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19427 d.one_operand_p = false;
19428 d.testing_p = false;
19429
19430 for (i = 0; i < nelt; ++i)
19431 d.perm[i] = i * 2 + odd;
19432
19433 /* We'll either be able to implement the permutation directly... */
19434 if (expand_vec_perm_1 (&d))
19435 return;
19436
19437 /* ... or we use the special-case patterns. */
19438 expand_vec_perm_even_odd_1 (&d, odd);
19439 }
19440
19441 static void
19442 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19443 {
19444 struct expand_vec_perm_d d;
19445 unsigned i, nelt, base;
19446 bool ok;
19447
19448 d.target = targ;
19449 d.op0 = op0;
19450 d.op1 = op1;
19451 d.vmode = GET_MODE (targ);
19452 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19453 d.one_operand_p = false;
19454 d.testing_p = false;
19455
19456 base = high_p ? nelt / 2 : 0;
19457 for (i = 0; i < nelt / 2; ++i)
19458 {
19459 d.perm[i * 2] = i + base;
19460 d.perm[i * 2 + 1] = i + base + nelt;
19461 }
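  /* E.g. for nelt == 8 this builds the selector { 0 8 1 9 2 10 3 11 } for
     the low half and { 4 12 5 13 6 14 7 15 } for the high half, where
     indices >= nelt refer to op1.  */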
19462
19463 /* Note that for AVX this isn't one instruction. */
19464 ok = ix86_expand_vec_perm_const_1 (&d);
19465 gcc_assert (ok);
19466 }
19467
19468 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
19469    under TARGET_AVX512BW.  E.g. for V16QImode a * b it generates
19470
19471 vpmovzxbw ymm2, xmm0
19472 vpmovzxbw ymm3, xmm1
19473 vpmullw ymm4, ymm2, ymm3
19474 vpmovwb xmm0, ymm4
19475
19476    which takes fewer instructions than ix86_expand_vecop_qihi would use.
19477    Return true on success.  */
19478
19479 bool
19480 ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
19481 {
19482 machine_mode himode, qimode = GET_MODE (dest);
19483 rtx hop1, hop2, hdest;
19484 rtx (*gen_extend)(rtx, rtx);
19485 rtx (*gen_truncate)(rtx, rtx);
19486
19487 /* There's no V64HImode multiplication instruction. */
19488 if (qimode == E_V64QImode)
19489 return false;
19490
19491   /* vpmovwb is only available under AVX512BW.  */
19492 if (!TARGET_AVX512BW)
19493 return false;
19494 if ((qimode == V8QImode || qimode == V16QImode)
19495 && !TARGET_AVX512VL)
19496 return false;
19497   /* Don't generate a zmm instruction when a 128/256-bit vector width is preferred.  */
19498 if (qimode == V32QImode
19499 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
19500 return false;
19501
19502 switch (qimode)
19503 {
19504 case E_V8QImode:
19505 himode = V8HImode;
19506 gen_extend = gen_zero_extendv8qiv8hi2;
19507 gen_truncate = gen_truncv8hiv8qi2;
19508 break;
19509 case E_V16QImode:
19510 himode = V16HImode;
19511 gen_extend = gen_zero_extendv16qiv16hi2;
19512 gen_truncate = gen_truncv16hiv16qi2;
19513 break;
19514 case E_V32QImode:
19515 himode = V32HImode;
19516 gen_extend = gen_zero_extendv32qiv32hi2;
19517 gen_truncate = gen_truncv32hiv32qi2;
19518 break;
19519 default:
19520 gcc_unreachable ();
19521 }
19522
19523 hop1 = gen_reg_rtx (himode);
19524 hop2 = gen_reg_rtx (himode);
19525 hdest = gen_reg_rtx (himode);
19526 emit_insn (gen_extend (hop1, op1));
19527 emit_insn (gen_extend (hop2, op2));
19528 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
19529 hop1, hop2)));
19530 emit_insn (gen_truncate (dest, hdest));
19531 return true;
19532 }
19533
19534 /* Expand a vector shift by a constant for a V*QImode in terms of the
19535    same operation on V*HImode.  Return true on success.  */
19536 bool
19537 ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19538 {
19539 machine_mode qimode, himode;
19540 unsigned int and_constant, xor_constant;
19541 HOST_WIDE_INT shift_amount;
19542 rtx vec_const_and, vec_const_xor;
19543 rtx tmp, op1_subreg;
19544 rtx (*gen_shift) (rtx, rtx, rtx);
19545 rtx (*gen_and) (rtx, rtx, rtx);
19546 rtx (*gen_xor) (rtx, rtx, rtx);
19547 rtx (*gen_sub) (rtx, rtx, rtx);
19548
19549 /* Only optimize shift by constant. */
19550 if (!CONST_INT_P (op2))
19551 return false;
19552
19553 qimode = GET_MODE (dest);
19554 shift_amount = INTVAL (op2);
19555   /* Do nothing when the shift amount is greater than or equal to 8.  */
19556 if (shift_amount > 7)
19557 return false;
19558
19559 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
19560   /* Position of the sign bit after the per-byte shift, for the ASHIFTRT fixup below.  */
19561 xor_constant = 1 << (8 - shift_amount - 1);
19562
19563   /* Mask that clears the bits shifted in from the neighboring byte: the low bits for a left shift, the high bits for a right shift.  */
19564 and_constant
19565 = (code == ASHIFT ? 256 - (1 << shift_amount)
19566 : (1 << (8 - shift_amount)) - 1);
19567
19568 switch (qimode)
19569 {
19570 case V16QImode:
19571 himode = V8HImode;
19572 gen_shift =
19573 ((code == ASHIFT)
19574 ? gen_ashlv8hi3
19575 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
19576 gen_and = gen_andv16qi3;
19577 gen_xor = gen_xorv16qi3;
19578 gen_sub = gen_subv16qi3;
19579 break;
19580 case V32QImode:
19581 himode = V16HImode;
19582 gen_shift =
19583 ((code == ASHIFT)
19584 ? gen_ashlv16hi3
19585 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
19586 gen_and = gen_andv32qi3;
19587 gen_xor = gen_xorv32qi3;
19588 gen_sub = gen_subv32qi3;
19589 break;
19590 case V64QImode:
19591 himode = V32HImode;
19592 gen_shift =
19593 ((code == ASHIFT)
19594 ? gen_ashlv32hi3
19595 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
19596 gen_and = gen_andv64qi3;
19597 gen_xor = gen_xorv64qi3;
19598 gen_sub = gen_subv64qi3;
19599 break;
19600 default:
19601 gcc_unreachable ();
19602 }
19603
19604 tmp = gen_reg_rtx (himode);
19605 vec_const_and = gen_reg_rtx (qimode);
19606 op1_subreg = lowpart_subreg (himode, op1, qimode);
19607
19608 /* For ASHIFT and LSHIFTRT, perform operation like
19609 vpsllw/vpsrlw $shift_amount, %op1, %dest.
19610 vpand %vec_const_and, %dest. */
19611 emit_insn (gen_shift (tmp, op1_subreg, op2));
19612 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
19613 emit_move_insn (vec_const_and,
19614 ix86_build_const_vector (qimode, true,
19615 GEN_INT (and_constant)));
19616 emit_insn (gen_and (dest, dest, vec_const_and));
19617
19618 /* For ASHIFTRT, perform extra operation like
19619 vpxor %vec_const_xor, %dest, %dest
19620 vpsubb %vec_const_xor, %dest, %dest */
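  /* With the high bits already cleared, (x ^ m) - m sign-extends from the
     bit marked by m == xor_constant: if that bit is set, the xor clears it
     and the subtraction borrows into all higher bits; otherwise the two
     operations cancel.  */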
19621 if (code == ASHIFTRT)
19622 {
19623 vec_const_xor = gen_reg_rtx (qimode);
19624 emit_move_insn (vec_const_xor,
19625 ix86_build_const_vector (qimode, true,
19626 GEN_INT (xor_constant)));
19627 emit_insn (gen_xor (dest, dest, vec_const_xor));
19628 emit_insn (gen_sub (dest, dest, vec_const_xor));
19629 }
19630 return true;
19631 }
19632
19633 /* Expand a vector operation CODE for a V*QImode in terms of the
19634 same operation on V*HImode. */
19635
19636 void
19637 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19638 {
19639 machine_mode qimode = GET_MODE (dest);
19640 machine_mode himode;
19641 rtx (*gen_il) (rtx, rtx, rtx);
19642 rtx (*gen_ih) (rtx, rtx, rtx);
19643 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19644 struct expand_vec_perm_d d;
19645 bool ok, full_interleave;
19646 bool uns_p = false;
19647 int i;
19648
19649 switch (qimode)
19650 {
19651 case E_V16QImode:
19652 himode = V8HImode;
19653 gen_il = gen_vec_interleave_lowv16qi;
19654 gen_ih = gen_vec_interleave_highv16qi;
19655 break;
19656 case E_V32QImode:
19657 himode = V16HImode;
19658 gen_il = gen_avx2_interleave_lowv32qi;
19659 gen_ih = gen_avx2_interleave_highv32qi;
19660 break;
19661 case E_V64QImode:
19662 himode = V32HImode;
19663 gen_il = gen_avx512bw_interleave_lowv64qi;
19664 gen_ih = gen_avx512bw_interleave_highv64qi;
19665 break;
19666 default:
19667 gcc_unreachable ();
19668 }
19669
19670 op2_l = op2_h = op2;
19671 switch (code)
19672 {
19673 case MULT:
19674 /* Unpack data such that we've got a source byte in each low byte of
19675 each word. We don't care what goes into the high byte of each word.
19676 Rather than trying to get zero in there, most convenient is to let
19677 it be a copy of the low byte. */
19678 op2_l = gen_reg_rtx (qimode);
19679 op2_h = gen_reg_rtx (qimode);
19680 emit_insn (gen_il (op2_l, op2, op2));
19681 emit_insn (gen_ih (op2_h, op2, op2));
19682
19683 op1_l = gen_reg_rtx (qimode);
19684 op1_h = gen_reg_rtx (qimode);
19685 emit_insn (gen_il (op1_l, op1, op1));
19686 emit_insn (gen_ih (op1_h, op1, op1));
19687 full_interleave = qimode == V16QImode;
19688 break;
19689
19690 case ASHIFT:
19691 case LSHIFTRT:
19692 uns_p = true;
19693 /* FALLTHRU */
19694 case ASHIFTRT:
19695 op1_l = gen_reg_rtx (himode);
19696 op1_h = gen_reg_rtx (himode);
19697 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19698 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19699 full_interleave = true;
19700 break;
19701 default:
19702 gcc_unreachable ();
19703 }
19704
19705 /* Perform the operation. */
19706 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19707 1, OPTAB_DIRECT);
19708 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19709 1, OPTAB_DIRECT);
19710 gcc_assert (res_l && res_h);
19711
19712 /* Merge the data back into the right place. */
19713 d.target = dest;
19714 d.op0 = gen_lowpart (qimode, res_l);
19715 d.op1 = gen_lowpart (qimode, res_h);
19716 d.vmode = qimode;
19717 d.nelt = GET_MODE_NUNITS (qimode);
19718 d.one_operand_p = false;
19719 d.testing_p = false;
19720
19721 if (full_interleave)
19722 {
19723       /* For SSE2, we used a full interleave, so the desired
19724 	 results are in the even elements.  */
19725 for (i = 0; i < d.nelt; ++i)
19726 d.perm[i] = i * 2;
19727 }
19728 else
19729 {
19730       /* For AVX, the interleave used above was not cross-lane, so the
19731 	 extraction picks the even elements but with the second and third
19732 	 quarters swapped.  Happily, that is even one insn shorter than a
19733 	 plain even extraction.  For AVX512BW we have 4 lanes.  We extract
19734 	 the evens from within each lane, always first from the first and then
19735 	 from the second source operand; the index bits above the low 4 remain the same.
19736 Thus, for d.nelt == 32 we want permutation
19737 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19738 and for d.nelt == 64 we want permutation
19739 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19740 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
19741 for (i = 0; i < d.nelt; ++i)
19742 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19743 }
19744
19745 ok = ix86_expand_vec_perm_const_1 (&d);
19746 gcc_assert (ok);
19747
19748 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19749 gen_rtx_fmt_ee (code, qimode, op1, op2));
19750 }
19751
19752 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
19753 if op is CONST_VECTOR with all odd elements equal to their
19754 preceding element. */
19755
19756 static bool
19757 const_vector_equal_evenodd_p (rtx op)
19758 {
19759 machine_mode mode = GET_MODE (op);
19760 int i, nunits = GET_MODE_NUNITS (mode);
19761 if (GET_CODE (op) != CONST_VECTOR
19762 || nunits != CONST_VECTOR_NUNITS (op))
19763 return false;
19764 for (i = 0; i < nunits; i += 2)
19765 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19766 return false;
19767 return true;
19768 }
19769
19770 void
19771 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19772 bool uns_p, bool odd_p)
19773 {
19774 machine_mode mode = GET_MODE (op1);
19775 machine_mode wmode = GET_MODE (dest);
19776 rtx x;
19777 rtx orig_op1 = op1, orig_op2 = op2;
19778
19779 if (!nonimmediate_operand (op1, mode))
19780 op1 = force_reg (mode, op1);
19781 if (!nonimmediate_operand (op2, mode))
19782 op2 = force_reg (mode, op2);
19783
19784 /* We only play even/odd games with vectors of SImode. */
19785 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19786
19787 /* If we're looking for the odd results, shift those members down to
19788 the even slots. For some cpus this is faster than a PSHUFD. */
19789 if (odd_p)
19790 {
19791 /* For XOP use vpmacsdqh, but only for smult, as it is only
19792 signed. */
19793 if (TARGET_XOP && mode == V4SImode && !uns_p)
19794 {
19795 x = force_reg (wmode, CONST0_RTX (wmode));
19796 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19797 return;
19798 }
19799
19800 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19801 if (!const_vector_equal_evenodd_p (orig_op1))
19802 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19803 x, NULL, 1, OPTAB_DIRECT);
19804 if (!const_vector_equal_evenodd_p (orig_op2))
19805 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19806 x, NULL, 1, OPTAB_DIRECT);
19807 op1 = gen_lowpart (mode, op1);
19808 op2 = gen_lowpart (mode, op2);
19809 }
19810
19811 if (mode == V16SImode)
19812 {
19813 if (uns_p)
19814 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19815 else
19816 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19817 }
19818 else if (mode == V8SImode)
19819 {
19820 if (uns_p)
19821 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19822 else
19823 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19824 }
19825 else if (uns_p)
19826 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19827 else if (TARGET_SSE4_1)
19828 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19829 else
19830 {
19831 rtx s1, s2, t0, t1, t2;
19832
19833 /* The easiest way to implement this without PMULDQ is to go through
19834 the motions as if we are performing a full 64-bit multiply. With
19835 the exception that we need to do less shuffling of the elements. */
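      /* Writing a = a_u - 2^32 * (a < 0) and likewise for b, the product
	 modulo 2^64 is a_u * b_u - 2^32 * ((a < 0) * b_u + (b < 0) * a_u);
	 the all-ones masks s1/s2 computed below supply exactly those
	 correction terms once multiplied, summed and shifted left by 32.  */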
19836
19837 /* Compute the sign-extension, aka highparts, of the two operands. */
19838 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19839 op1, pc_rtx, pc_rtx);
19840 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19841 op2, pc_rtx, pc_rtx);
19842
19843 /* Multiply LO(A) * HI(B), and vice-versa. */
19844 t1 = gen_reg_rtx (wmode);
19845 t2 = gen_reg_rtx (wmode);
19846 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19847 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19848
19849 /* Multiply LO(A) * LO(B). */
19850 t0 = gen_reg_rtx (wmode);
19851 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19852
19853 /* Combine and shift the highparts into place. */
19854 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19855 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19856 1, OPTAB_DIRECT);
19857
19858 /* Combine high and low parts. */
19859 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19860 return;
19861 }
19862 emit_insn (x);
19863 }
19864
19865 void
19866 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19867 bool uns_p, bool high_p)
19868 {
19869 machine_mode wmode = GET_MODE (dest);
19870 machine_mode mode = GET_MODE (op1);
19871 rtx t1, t2, t3, t4, mask;
19872
19873 switch (mode)
19874 {
19875 case E_V4SImode:
19876 t1 = gen_reg_rtx (mode);
19877 t2 = gen_reg_rtx (mode);
19878 if (TARGET_XOP && !uns_p)
19879 {
19880 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19881 shuffle the elements once so that all elements are in the right
19882 place for immediate use: { A C B D }. */
19883 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19884 const1_rtx, GEN_INT (3)));
19885 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19886 const1_rtx, GEN_INT (3)));
19887 }
19888 else
19889 {
19890 /* Put the elements into place for the multiply. */
19891 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19892 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19893 high_p = false;
19894 }
19895 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19896 break;
19897
19898 case E_V8SImode:
19899 /* Shuffle the elements between the lanes. After this we
19900 have { A B E F | C D G H } for each operand. */
19901 t1 = gen_reg_rtx (V4DImode);
19902 t2 = gen_reg_rtx (V4DImode);
19903 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19904 const0_rtx, const2_rtx,
19905 const1_rtx, GEN_INT (3)));
19906 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19907 const0_rtx, const2_rtx,
19908 const1_rtx, GEN_INT (3)));
19909
19910 /* Shuffle the elements within the lanes. After this we
19911 have { A A B B | C C D D } or { E E F F | G G H H }. */
19912 t3 = gen_reg_rtx (V8SImode);
19913 t4 = gen_reg_rtx (V8SImode);
19914 mask = GEN_INT (high_p
19915 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19916 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19917 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19918 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19919
19920 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19921 break;
19922
19923 case E_V8HImode:
19924 case E_V16HImode:
19925 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19926 uns_p, OPTAB_DIRECT);
19927 t2 = expand_binop (mode,
19928 uns_p ? umul_highpart_optab : smul_highpart_optab,
19929 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19930 gcc_assert (t1 && t2);
19931
19932 t3 = gen_reg_rtx (mode);
19933 ix86_expand_vec_interleave (t3, t1, t2, high_p);
19934 emit_move_insn (dest, gen_lowpart (wmode, t3));
19935 break;
19936
19937 case E_V16QImode:
19938 case E_V32QImode:
19939 case E_V32HImode:
19940 case E_V16SImode:
19941 case E_V64QImode:
19942 t1 = gen_reg_rtx (wmode);
19943 t2 = gen_reg_rtx (wmode);
19944 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19945 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19946
19947 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19948 break;
19949
19950 default:
19951 gcc_unreachable ();
19952 }
19953 }
19954
19955 void
19956 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19957 {
19958 rtx res_1, res_2, res_3, res_4;
19959
19960 res_1 = gen_reg_rtx (V4SImode);
19961 res_2 = gen_reg_rtx (V4SImode);
19962 res_3 = gen_reg_rtx (V2DImode);
19963 res_4 = gen_reg_rtx (V2DImode);
19964 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19965 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19966
19967 /* Move the results in element 2 down to element 1; we don't care
19968 what goes in elements 2 and 3. Then we can merge the parts
19969 back together with an interleave.
19970
19971 Note that two other sequences were tried:
19972 (1) Use interleaves at the start instead of psrldq, which allows
19973 us to use a single shufps to merge things back at the end.
19974 (2) Use shufps here to combine the two vectors, then pshufd to
19975 put the elements in the correct order.
19976 In both cases the cost of the reformatting stall was too high
19977 and the overall sequence slower. */
19978
19979 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19980 const0_rtx, const2_rtx,
19981 const0_rtx, const0_rtx));
19982 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19983 const0_rtx, const2_rtx,
19984 const0_rtx, const0_rtx));
19985 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19986
19987 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19988 }
19989
19990 void
19991 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19992 {
19993 machine_mode mode = GET_MODE (op0);
19994 rtx t1, t2, t3, t4, t5, t6;
19995
19996 if (TARGET_AVX512DQ && mode == V8DImode)
19997 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19998 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19999 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
20000 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
20001 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
20002 else if (TARGET_XOP && mode == V2DImode)
20003 {
20004 /* op1: A,B,C,D, op2: E,F,G,H */
20005 op1 = gen_lowpart (V4SImode, op1);
20006 op2 = gen_lowpart (V4SImode, op2);
20007
20008 t1 = gen_reg_rtx (V4SImode);
20009 t2 = gen_reg_rtx (V4SImode);
20010 t3 = gen_reg_rtx (V2DImode);
20011 t4 = gen_reg_rtx (V2DImode);
20012
20013 /* t1: B,A,D,C */
20014 emit_insn (gen_sse2_pshufd_1 (t1, op1,
20015 GEN_INT (1),
20016 GEN_INT (0),
20017 GEN_INT (3),
20018 GEN_INT (2)));
20019
20020 /* t2: (B*E),(A*F),(D*G),(C*H) */
20021 emit_insn (gen_mulv4si3 (t2, t1, op2));
20022
20023 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20024 emit_insn (gen_xop_phadddq (t3, t2));
20025
20026 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20027 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
20028
20029       /* Multiply the low parts and add all the partial products.  */
20030 t5 = gen_reg_rtx (V2DImode);
20031 emit_insn (gen_vec_widen_umult_even_v4si (t5,
20032 gen_lowpart (V4SImode, op1),
20033 gen_lowpart (V4SImode, op2)));
20034 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
20035 }
20036 else
20037 {
20038 machine_mode nmode;
20039 rtx (*umul) (rtx, rtx, rtx);
20040
20041 if (mode == V2DImode)
20042 {
20043 umul = gen_vec_widen_umult_even_v4si;
20044 nmode = V4SImode;
20045 }
20046 else if (mode == V4DImode)
20047 {
20048 umul = gen_vec_widen_umult_even_v8si;
20049 nmode = V8SImode;
20050 }
20051 else if (mode == V8DImode)
20052 {
20053 umul = gen_vec_widen_umult_even_v16si;
20054 nmode = V16SImode;
20055 }
20056 else
20057 gcc_unreachable ();
20058
20059
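      /* With a = a_h * 2^32 + a_l and b = b_h * 2^32 + b_l, the product
	 modulo 2^64 is a_l * b_l + 2^32 * (a_h * b_l + b_h * a_l); the code
	 below computes exactly those three widening 32x32 multiplies.  */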
20060 /* Multiply low parts. */
20061 t1 = gen_reg_rtx (mode);
20062 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
20063
20064 /* Shift input vectors right 32 bits so we can multiply high parts. */
20065 t6 = GEN_INT (32);
20066 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
20067 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
20068
20069 /* Multiply high parts by low parts. */
20070 t4 = gen_reg_rtx (mode);
20071 t5 = gen_reg_rtx (mode);
20072 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
20073 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
20074
20075 /* Combine and shift the highparts back. */
20076 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
20077 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
20078
20079 /* Combine high and low parts. */
20080 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
20081 }
20082
20083 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20084 gen_rtx_MULT (mode, op1, op2));
20085 }
20086
20087 /* Return true if the control transfer instruction INSN
20088    should be encoded with the notrack prefix.  */
20089
20090 bool
20091 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
20092 {
20093 if (!insn || !((flag_cf_protection & CF_BRANCH)))
20094 return false;
20095
20096 if (CALL_P (insn))
20097 {
20098 rtx call = get_call_rtx_from (insn);
20099 gcc_assert (call != NULL_RTX);
20100 rtx addr = XEXP (call, 0);
20101
20102 /* Do not emit 'notrack' if it's not an indirect call. */
20103 if (MEM_P (addr)
20104 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
20105 return false;
20106 else
20107 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
20108 }
20109
20110 if (JUMP_P (insn) && !flag_cet_switch)
20111 {
20112 rtx target = JUMP_LABEL (insn);
20113 if (target == NULL_RTX || ANY_RETURN_P (target))
20114 return false;
20115
20116       /* Check whether the jump is a switch table jump.  */
20117 rtx_insn *label = as_a<rtx_insn *> (target);
20118 rtx_insn *table = next_insn (label);
20119 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20120 return false;
20121 else
20122 return true;
20123 }
20124 return false;
20125 }
20126
20127 /* Calculate integer abs() using only SSE2 instructions. */
20128
20129 void
20130 ix86_expand_sse2_abs (rtx target, rtx input)
20131 {
20132 machine_mode mode = GET_MODE (target);
20133 rtx tmp0, tmp1, x;
20134
20135 switch (mode)
20136 {
20137 case E_V2DImode:
20138 case E_V4DImode:
20139       /* For 64-bit signed integer X, with SSE4.2 use
20140 	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20141 	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
20142 	 of 32 and use a logical instead of an arithmetic right shift (which
20143 	 is unimplemented), negating the result to form the sign mask.  */
20144 if (TARGET_SSE4_2)
20145 {
20146 tmp0 = gen_reg_rtx (mode);
20147 tmp1 = gen_reg_rtx (mode);
20148 emit_move_insn (tmp1, CONST0_RTX (mode));
20149 if (mode == E_V2DImode)
20150 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20151 else
20152 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20153 }
20154 else
20155 {
20156 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20157 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20158 - 1), NULL, 0, OPTAB_DIRECT);
20159 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20160 }
20161
20162 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20163 NULL, 0, OPTAB_DIRECT);
20164 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20165 target, 0, OPTAB_DIRECT);
20166 break;
20167
20168 case E_V4SImode:
20169 /* For 32-bit signed integer X, the best way to calculate the absolute
20170 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
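      /* E.g. for X == -5 the arithmetic shift gives -1, (-5 ^ -1) == 4 and
	 4 - (-1) == 5; for non-negative X the shift gives 0 and X is
	 returned unchanged.  */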
20171 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20172 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20173 NULL, 0, OPTAB_DIRECT);
20174 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20175 NULL, 0, OPTAB_DIRECT);
20176 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20177 target, 0, OPTAB_DIRECT);
20178 break;
20179
20180 case E_V8HImode:
20181 /* For 16-bit signed integer X, the best way to calculate the absolute
20182 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20183 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20184
20185 x = expand_simple_binop (mode, SMAX, tmp0, input,
20186 target, 0, OPTAB_DIRECT);
20187 break;
20188
20189 case E_V16QImode:
20190 /* For 8-bit signed integer X, the best way to calculate the absolute
20191 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20192 as SSE2 provides the PMINUB insn. */
20193 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20194
20195 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20196 target, 0, OPTAB_DIRECT);
20197 break;
20198
20199 default:
20200 gcc_unreachable ();
20201 }
20202
20203 if (x != target)
20204 emit_move_insn (target, x);
20205 }
20206
20207 /* Expand an extract from a vector register through pextr insn.
20208 Return true if successful. */
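/* For instance, extracting an HImode value at bit position 32 of a
   V8HImode source selects element 32 / 16 == 2 and is emitted as a
   pextrw of element 2, zero-extended to SImode.  */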
20209
20210 bool
20211 ix86_expand_pextr (rtx *operands)
20212 {
20213 rtx dst = operands[0];
20214 rtx src = operands[1];
20215
20216 unsigned int size = INTVAL (operands[2]);
20217 unsigned int pos = INTVAL (operands[3]);
20218
20219 if (SUBREG_P (dst))
20220 {
20221 /* Reject non-lowpart subregs. */
20222 if (SUBREG_BYTE (dst) > 0)
20223 return false;
20224 dst = SUBREG_REG (dst);
20225 }
20226
20227 if (SUBREG_P (src))
20228 {
20229 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20230 src = SUBREG_REG (src);
20231 }
20232
20233 switch (GET_MODE (src))
20234 {
20235 case E_V16QImode:
20236 case E_V8HImode:
20237 case E_V4SImode:
20238 case E_V2DImode:
20239 case E_V1TImode:
20240 case E_TImode:
20241 {
20242 machine_mode srcmode, dstmode;
20243 rtx d, pat;
20244
20245 if (!int_mode_for_size (size, 0).exists (&dstmode))
20246 return false;
20247
20248 switch (dstmode)
20249 {
20250 case E_QImode:
20251 if (!TARGET_SSE4_1)
20252 return false;
20253 srcmode = V16QImode;
20254 break;
20255
20256 case E_HImode:
20257 if (!TARGET_SSE2)
20258 return false;
20259 srcmode = V8HImode;
20260 break;
20261
20262 case E_SImode:
20263 if (!TARGET_SSE4_1)
20264 return false;
20265 srcmode = V4SImode;
20266 break;
20267
20268 case E_DImode:
20269 gcc_assert (TARGET_64BIT);
20270 if (!TARGET_SSE4_1)
20271 return false;
20272 srcmode = V2DImode;
20273 break;
20274
20275 default:
20276 return false;
20277 }
20278
20279 /* Reject extractions from misaligned positions. */
20280 if (pos & (size-1))
20281 return false;
20282
20283 if (GET_MODE (dst) == dstmode)
20284 d = dst;
20285 else
20286 d = gen_reg_rtx (dstmode);
20287
20288 /* Construct insn pattern. */
20289 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20290 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20291
20292 /* Let the rtl optimizers know about the zero extension performed. */
20293 if (dstmode == QImode || dstmode == HImode)
20294 {
20295 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20296 d = gen_lowpart (SImode, d);
20297 }
20298
20299 emit_insn (gen_rtx_SET (d, pat));
20300
20301 if (d != dst)
20302 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20303 return true;
20304 }
20305
20306 default:
20307 return false;
20308 }
20309 }
20310
20311 /* Expand an insert into a vector register through pinsr insn.
20312 Return true if successful. */
20313
20314 bool
20315 ix86_expand_pinsr (rtx *operands)
20316 {
20317 rtx dst = operands[0];
20318 rtx src = operands[3];
20319
20320 unsigned int size = INTVAL (operands[1]);
20321 unsigned int pos = INTVAL (operands[2]);
20322
20323 if (SUBREG_P (dst))
20324 {
20325 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20326 dst = SUBREG_REG (dst);
20327 }
20328
20329 switch (GET_MODE (dst))
20330 {
20331 case E_V16QImode:
20332 case E_V8HImode:
20333 case E_V4SImode:
20334 case E_V2DImode:
20335 case E_V1TImode:
20336 case E_TImode:
20337 {
20338 machine_mode srcmode, dstmode;
20339 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20340 rtx d;
20341
20342 if (!int_mode_for_size (size, 0).exists (&srcmode))
20343 return false;
20344
20345 switch (srcmode)
20346 {
20347 case E_QImode:
20348 if (!TARGET_SSE4_1)
20349 return false;
20350 dstmode = V16QImode;
20351 pinsr = gen_sse4_1_pinsrb;
20352 break;
20353
20354 case E_HImode:
20355 if (!TARGET_SSE2)
20356 return false;
20357 dstmode = V8HImode;
20358 pinsr = gen_sse2_pinsrw;
20359 break;
20360
20361 case E_SImode:
20362 if (!TARGET_SSE4_1)
20363 return false;
20364 dstmode = V4SImode;
20365 pinsr = gen_sse4_1_pinsrd;
20366 break;
20367
20368 case E_DImode:
20369 gcc_assert (TARGET_64BIT);
20370 if (!TARGET_SSE4_1)
20371 return false;
20372 dstmode = V2DImode;
20373 pinsr = gen_sse4_1_pinsrq;
20374 break;
20375
20376 default:
20377 return false;
20378 }
20379
20380 /* Reject insertions to misaligned positions. */
20381 if (pos & (size-1))
20382 return false;
20383
20384 if (SUBREG_P (src))
20385 {
20386 unsigned int srcpos = SUBREG_BYTE (src);
20387
20388 if (srcpos > 0)
20389 {
20390 rtx extr_ops[4];
20391
20392 extr_ops[0] = gen_reg_rtx (srcmode);
20393 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20394 extr_ops[2] = GEN_INT (size);
20395 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20396
20397 if (!ix86_expand_pextr (extr_ops))
20398 return false;
20399
20400 src = extr_ops[0];
20401 }
20402 else
20403 src = gen_lowpart (srcmode, SUBREG_REG (src));
20404 }
20405
20406 if (GET_MODE (dst) == dstmode)
20407 d = dst;
20408 else
20409 d = gen_reg_rtx (dstmode);
20410
20411 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20412 gen_lowpart (srcmode, src),
20413 GEN_INT (1 << (pos / size))));
20414 if (d != dst)
20415 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20416 return true;
20417 }
20418
20419 default:
20420 return false;
20421 }
20422 }
20423
20424 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
20425    of the upper halves against the lower halves down to SSE register size.  */
20426
20427 machine_mode
20428 ix86_split_reduction (machine_mode mode)
20429 {
20430 /* Reduce lowpart against highpart until we reach SSE reg width to
20431 avoid cross-lane operations. */
20432 switch (mode)
20433 {
20434 case E_V8DImode:
20435 case E_V4DImode:
20436 return V2DImode;
20437 case E_V16SImode:
20438 case E_V8SImode:
20439 return V4SImode;
20440 case E_V32HImode:
20441 case E_V16HImode:
20442 return V8HImode;
20443 case E_V64QImode:
20444 case E_V32QImode:
20445 return V16QImode;
20446 case E_V16SFmode:
20447 case E_V8SFmode:
20448 return V4SFmode;
20449 case E_V8DFmode:
20450 case E_V4DFmode:
20451 return V2DFmode;
20452 default:
20453 return mode;
20454 }
20455 }
20456
20457 /* Generate call to __divmoddi4. */
20458
20459 void
20460 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20461 rtx op0, rtx op1,
20462 rtx *quot_p, rtx *rem_p)
20463 {
20464 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20465
20466 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20467 mode, op0, mode, op1, mode,
20468 XEXP (rem, 0), Pmode);
20469 *quot_p = quot;
20470 *rem_p = rem;
20471 }
20472
20473 #include "gt-i386-expand.h"