Add expander for movp2hi and movp2qi.
[gcc.git] / gcc / config / i386 / i386-expand.c
1 /* Copyright (C) 1988-2020 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory references,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157 hi_half[num] = simplify_gen_subreg (half_mode, op,
158 GET_MODE (op) == VOIDmode
159 ? mode : GET_MODE (op), byte);
160 }
161 }
162 }
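/* Illustrative sketch of how a splitter might call the function above;
   the DImode add context and the "lo"/"hi" locals are assumed here, not
   taken from a real pattern:

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);

   A DImode REG yields SImode subwords at byte offsets 0 and 4, while an
   offsettable MEM is split with adjust_address at the same offsets.  */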
163
164 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
165 for the target. */
166
167 void
168 ix86_expand_clear (rtx dest)
169 {
170 rtx tmp;
171
172 /* We play register width games, which are only valid after reload. */
173 gcc_assert (reload_completed);
174
175 /* Avoid HImode and its attendant prefix byte. */
176 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
177 dest = gen_rtx_REG (SImode, REGNO (dest));
178 tmp = gen_rtx_SET (dest, const0_rtx);
179
180 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
181 {
182 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
184 }
185
186 emit_insn (tmp);
187 }
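/* For example, clearing %eax normally expands to

     (parallel [(set (reg:SI ax) (const_int 0))
                (clobber (reg:CC flags))])

   i.e. "xorl %eax, %eax"; only when TARGET_USE_MOV0 is set and we are not
   optimizing for size do we emit a plain "movl $0, %eax" without the
   flags clobber.  */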
188
189 void
190 ix86_expand_move (machine_mode mode, rtx operands[])
191 {
192 rtx op0, op1;
193 rtx tmp, addend = NULL_RTX;
194 enum tls_model model;
195
196 op0 = operands[0];
197 op1 = operands[1];
198
199 switch (GET_CODE (op1))
200 {
201 case CONST:
202 tmp = XEXP (op1, 0);
203
204 if (GET_CODE (tmp) != PLUS
205 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
206 break;
207
208 op1 = XEXP (tmp, 0);
209 addend = XEXP (tmp, 1);
210 /* FALLTHRU */
211
212 case SYMBOL_REF:
213 model = SYMBOL_REF_TLS_MODEL (op1);
214
215 if (model)
216 op1 = legitimize_tls_address (op1, model, true);
217 else if (ix86_force_load_from_GOT_p (op1))
218 {
219 /* Load the external function address via GOT slot to avoid PLT. */
220 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
221 (TARGET_64BIT
222 ? UNSPEC_GOTPCREL
223 : UNSPEC_GOT));
224 op1 = gen_rtx_CONST (Pmode, op1);
225 op1 = gen_const_mem (Pmode, op1);
226 set_mem_alias_set (op1, ix86_GOT_alias_set ());
227 }
228 else
229 {
230 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
231 if (tmp)
232 {
233 op1 = tmp;
234 if (!addend)
235 break;
236 }
237 else
238 {
239 op1 = operands[1];
240 break;
241 }
242 }
243
244 if (addend)
245 {
246 op1 = force_operand (op1, NULL_RTX);
247 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
248 op0, 1, OPTAB_DIRECT);
249 }
250 else
251 op1 = force_operand (op1, op0);
252
253 if (op1 == op0)
254 return;
255
256 op1 = convert_to_mode (mode, op1, 1);
257
258 default:
259 break;
260 }
261
262 if ((flag_pic || MACHOPIC_INDIRECT)
263 && symbolic_operand (op1, mode))
264 {
265 if (TARGET_MACHO && !TARGET_64BIT)
266 {
267 #if TARGET_MACHO
268 /* dynamic-no-pic */
269 if (MACHOPIC_INDIRECT)
270 {
271 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
272 ? op0 : gen_reg_rtx (Pmode);
273 op1 = machopic_indirect_data_reference (op1, temp);
274 if (MACHOPIC_PURE)
275 op1 = machopic_legitimize_pic_address (op1, mode,
276 temp == op1 ? 0 : temp);
277 }
278 if (op0 != op1 && GET_CODE (op0) != MEM)
279 {
280 rtx insn = gen_rtx_SET (op0, op1);
281 emit_insn (insn);
282 return;
283 }
284 if (GET_CODE (op0) == MEM)
285 op1 = force_reg (Pmode, op1);
286 else
287 {
288 rtx temp = op0;
289 if (GET_CODE (temp) != REG)
290 temp = gen_reg_rtx (Pmode);
291 temp = legitimize_pic_address (op1, temp);
292 if (temp == op0)
293 return;
294 op1 = temp;
295 }
296 /* dynamic-no-pic */
297 #endif
298 }
299 else
300 {
301 if (MEM_P (op0))
302 op1 = force_reg (mode, op1);
303 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
304 {
305 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
306 op1 = legitimize_pic_address (op1, reg);
307 if (op0 == op1)
308 return;
309 op1 = convert_to_mode (mode, op1, 1);
310 }
311 }
312 }
313 else
314 {
315 if (MEM_P (op0)
316 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
317 || !push_operand (op0, mode))
318 && MEM_P (op1))
319 op1 = force_reg (mode, op1);
320
321 if (push_operand (op0, mode)
322 && ! general_no_elim_operand (op1, mode))
323 op1 = copy_to_mode_reg (mode, op1);
324
325 /* Force large constants in 64-bit compilation into a register
326 to get them CSEd. */
327 if (can_create_pseudo_p ()
328 && (mode == DImode) && TARGET_64BIT
329 && immediate_operand (op1, mode)
330 && !x86_64_zext_immediate_operand (op1, VOIDmode)
331 && !register_operand (op0, mode)
332 && optimize)
333 op1 = copy_to_mode_reg (mode, op1);
334
335 if (can_create_pseudo_p ()
336 && CONST_DOUBLE_P (op1))
337 {
338 /* If we are loading a floating point constant into a register,
339 force the value to memory now, since we'll get better code
340 out of the back end. */
341
342 op1 = validize_mem (force_const_mem (mode, op1));
343 if (!register_operand (op0, mode))
344 {
345 rtx temp = gen_reg_rtx (mode);
346 emit_insn (gen_rtx_SET (temp, op1));
347 emit_move_insn (op0, temp);
348 return;
349 }
350 }
351 }
352
353 emit_insn (gen_rtx_SET (op0, op1));
354 }
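/* Illustrative example of the SYMBOL_REF case above: when
   ix86_force_load_from_GOT_p holds for an external function "foo" (a
   hypothetical name), the address is loaded from its GOT slot instead of
   being referenced directly, roughly

     movq foo@GOTPCREL(%rip), %rax       # 64-bit, UNSPEC_GOTPCREL
     movl foo@GOT(%ebx), %eax            # 32-bit PIC, UNSPEC_GOT

   which avoids going through the PLT.  */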
355
356 void
357 ix86_expand_vector_move (machine_mode mode, rtx operands[])
358 {
359 rtx op0 = operands[0], op1 = operands[1];
360 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
361 psABI, since the biggest alignment there is 4 bytes. */
362 unsigned int align = (TARGET_IAMCU
363 ? GET_MODE_BITSIZE (mode)
364 : GET_MODE_ALIGNMENT (mode));
365
366 if (push_operand (op0, VOIDmode))
367 op0 = emit_move_resolve_push (mode, op0);
368
369 /* Force constants other than zero into memory. We do not know how
370 the instructions used to build constants modify the upper 64 bits
371 of the register; once we have that information we may be able
372 to handle some of them more efficiently. */
373 if (can_create_pseudo_p ()
374 && (CONSTANT_P (op1)
375 || (SUBREG_P (op1)
376 && CONSTANT_P (SUBREG_REG (op1))))
377 && ((register_operand (op0, mode)
378 && !standard_sse_constant_p (op1, mode))
379 /* ix86_expand_vector_move_misalign() does not like constants. */
380 || (SSE_REG_MODE_P (mode)
381 && MEM_P (op0)
382 && MEM_ALIGN (op0) < align)))
383 {
384 if (SUBREG_P (op1))
385 {
386 machine_mode imode = GET_MODE (SUBREG_REG (op1));
387 rtx r = force_const_mem (imode, SUBREG_REG (op1));
388 if (r)
389 r = validize_mem (r);
390 else
391 r = force_reg (imode, SUBREG_REG (op1));
392 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
393 }
394 else
395 op1 = validize_mem (force_const_mem (mode, op1));
396 }
397
398 /* We need to check memory alignment for SSE modes since attributes
399 can make operands unaligned. */
400 if (can_create_pseudo_p ()
401 && SSE_REG_MODE_P (mode)
402 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
403 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
404 {
405 rtx tmp[2];
406
407 /* ix86_expand_vector_move_misalign() does not like both
408 arguments in memory. */
409 if (!register_operand (op0, mode)
410 && !register_operand (op1, mode))
411 op1 = force_reg (mode, op1);
412
413 tmp[0] = op0; tmp[1] = op1;
414 ix86_expand_vector_move_misalign (mode, tmp);
415 return;
416 }
417
418 /* Make operand1 a register if it isn't already. */
419 if (can_create_pseudo_p ()
420 && !register_operand (op0, mode)
421 && !register_operand (op1, mode))
422 {
423 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
424 return;
425 }
426
427 emit_insn (gen_rtx_SET (op0, op1));
428 }
429
430 /* Split 32-byte AVX unaligned load and store if needed. */
431
432 static void
433 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
434 {
435 rtx m;
436 rtx (*extract) (rtx, rtx, rtx);
437 machine_mode mode;
438
439 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
440 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
441 {
442 emit_insn (gen_rtx_SET (op0, op1));
443 return;
444 }
445
446 rtx orig_op0 = NULL_RTX;
447 mode = GET_MODE (op0);
448 switch (GET_MODE_CLASS (mode))
449 {
450 case MODE_VECTOR_INT:
451 case MODE_INT:
452 if (mode != V32QImode)
453 {
454 if (!MEM_P (op0))
455 {
456 orig_op0 = op0;
457 op0 = gen_reg_rtx (V32QImode);
458 }
459 else
460 op0 = gen_lowpart (V32QImode, op0);
461 op1 = gen_lowpart (V32QImode, op1);
462 mode = V32QImode;
463 }
464 break;
465 case MODE_VECTOR_FLOAT:
466 break;
467 default:
468 gcc_unreachable ();
469 }
470
471 switch (mode)
472 {
473 default:
474 gcc_unreachable ();
475 case E_V32QImode:
476 extract = gen_avx_vextractf128v32qi;
477 mode = V16QImode;
478 break;
479 case E_V8SFmode:
480 extract = gen_avx_vextractf128v8sf;
481 mode = V4SFmode;
482 break;
483 case E_V4DFmode:
484 extract = gen_avx_vextractf128v4df;
485 mode = V2DFmode;
486 break;
487 }
488
489 if (MEM_P (op1))
490 {
491 rtx r = gen_reg_rtx (mode);
492 m = adjust_address (op1, mode, 0);
493 emit_move_insn (r, m);
494 m = adjust_address (op1, mode, 16);
495 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
496 emit_move_insn (op0, r);
497 }
498 else if (MEM_P (op0))
499 {
500 m = adjust_address (op0, mode, 0);
501 emit_insn (extract (m, op1, const0_rtx));
502 m = adjust_address (op0, mode, 16);
503 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
504 }
505 else
506 gcc_unreachable ();
507
508 if (orig_op0)
509 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
510 }
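/* For instance, with -mavx256-split-unaligned-load a misaligned 32-byte
   load is emitted as two 16-byte halves, roughly

     vmovups     mem, %xmm0
     vinsertf128 $1, mem+16, %ymm0, %ymm0

   and with -mavx256-split-unaligned-store the store side becomes roughly

     vmovups      %xmm0, mem
     vextractf128 $1, %ymm0, mem+16

   instead of a single 32-byte vmovups.  */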
511
512 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
513 straight to ix86_expand_vector_move. */
514 /* Code generation for scalar reg-reg moves of single and double precision data:
515 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
516 movaps reg, reg
517 else
518 movss reg, reg
519 if (x86_sse_partial_reg_dependency == true)
520 movapd reg, reg
521 else
522 movsd reg, reg
523
524 Code generation for scalar loads of double precision data:
525 if (x86_sse_split_regs == true)
526 movlpd mem, reg (gas syntax)
527 else
528 movsd mem, reg
529
530 Code generation for unaligned packed loads of single precision data
531 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
532 if (x86_sse_unaligned_move_optimal)
533 movups mem, reg
534
535 if (x86_sse_partial_reg_dependency == true)
536 {
537 xorps reg, reg
538 movlps mem, reg
539 movhps mem+8, reg
540 }
541 else
542 {
543 movlps mem, reg
544 movhps mem+8, reg
545 }
546
547 Code generation for unaligned packed loads of double precision data
548 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
549 if (x86_sse_unaligned_move_optimal)
550 movupd mem, reg
551
552 if (x86_sse_split_regs == true)
553 {
554 movlpd mem, reg
555 movhpd mem+8, reg
556 }
557 else
558 {
559 movsd mem, reg
560 movhpd mem+8, reg
561 }
562 */
563
564 void
565 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
566 {
567 rtx op0, op1, m;
568
569 op0 = operands[0];
570 op1 = operands[1];
571
572 /* Use unaligned load/store for AVX512 or when optimizing for size. */
573 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
574 {
575 emit_insn (gen_rtx_SET (op0, op1));
576 return;
577 }
578
579 if (TARGET_AVX)
580 {
581 if (GET_MODE_SIZE (mode) == 32)
582 ix86_avx256_split_vector_move_misalign (op0, op1);
583 else
584 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
585 emit_insn (gen_rtx_SET (op0, op1));
586 return;
587 }
588
589 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
590 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
591 {
592 emit_insn (gen_rtx_SET (op0, op1));
593 return;
594 }
595
596 /* ??? If we have typed data, then it would appear that using
597 movdqu is the only way to get unaligned data loaded with
598 integer type. */
599 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
600 {
601 emit_insn (gen_rtx_SET (op0, op1));
602 return;
603 }
604
605 if (MEM_P (op1))
606 {
607 if (TARGET_SSE2 && mode == V2DFmode)
608 {
609 rtx zero;
610
611 /* When SSE registers are split into halves, we can avoid
612 writing to the top half twice. */
613 if (TARGET_SSE_SPLIT_REGS)
614 {
615 emit_clobber (op0);
616 zero = op0;
617 }
618 else
619 {
620 /* ??? Not sure about the best option for the Intel chips.
621 The following would seem to satisfy; the register is
622 entirely cleared, breaking the dependency chain. We
623 then store to the upper half, with a dependency depth
624 of one. A rumor has it that Intel recommends two movsd
625 followed by an unpacklpd, but this is unconfirmed. And
626 given that the dependency depth of the unpacklpd would
627 still be one, I'm not sure why this would be better. */
628 zero = CONST0_RTX (V2DFmode);
629 }
630
631 m = adjust_address (op1, DFmode, 0);
632 emit_insn (gen_sse2_loadlpd (op0, zero, m));
633 m = adjust_address (op1, DFmode, 8);
634 emit_insn (gen_sse2_loadhpd (op0, op0, m));
635 }
636 else
637 {
638 rtx t;
639
640 if (mode != V4SFmode)
641 t = gen_reg_rtx (V4SFmode);
642 else
643 t = op0;
644
645 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
646 emit_move_insn (t, CONST0_RTX (V4SFmode));
647 else
648 emit_clobber (t);
649
650 m = adjust_address (op1, V2SFmode, 0);
651 emit_insn (gen_sse_loadlps (t, t, m));
652 m = adjust_address (op1, V2SFmode, 8);
653 emit_insn (gen_sse_loadhps (t, t, m));
654 if (mode != V4SFmode)
655 emit_move_insn (op0, gen_lowpart (mode, t));
656 }
657 }
658 else if (MEM_P (op0))
659 {
660 if (TARGET_SSE2 && mode == V2DFmode)
661 {
662 m = adjust_address (op0, DFmode, 0);
663 emit_insn (gen_sse2_storelpd (m, op1));
664 m = adjust_address (op0, DFmode, 8);
665 emit_insn (gen_sse2_storehpd (m, op1));
666 }
667 else
668 {
669 if (mode != V4SFmode)
670 op1 = gen_lowpart (V4SFmode, op1);
671
672 m = adjust_address (op0, V2SFmode, 0);
673 emit_insn (gen_sse_storelps (m, op1));
674 m = adjust_address (op0, V2SFmode, 8);
675 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
676 }
677 }
678 else
679 gcc_unreachable ();
680 }
681
682 /* Move bits 64:95 to bits 32:63. */
683
684 void
685 ix86_move_vector_high_sse_to_mmx (rtx op)
686 {
687 rtx mask = gen_rtx_PARALLEL (VOIDmode,
688 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
689 GEN_INT (0), GEN_INT (0)));
690 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
691 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
692 rtx insn = gen_rtx_SET (dest, op);
693 emit_insn (insn);
694 }
695
696 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
697
698 void
699 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
700 {
701 rtx op0 = operands[0];
702 rtx op1 = operands[1];
703 rtx op2 = operands[2];
704
705 machine_mode dmode = GET_MODE (op0);
706 machine_mode smode = GET_MODE (op1);
707 machine_mode inner_dmode = GET_MODE_INNER (dmode);
708 machine_mode inner_smode = GET_MODE_INNER (smode);
709
710 /* Get the corresponding SSE mode for destination. */
711 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
712 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
713 nunits).require ();
714 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
715 nunits / 2).require ();
716
717 /* Get the corresponding SSE mode for source. */
718 nunits = 16 / GET_MODE_SIZE (inner_smode);
719 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
720 nunits).require ();
721
722 /* Generate SSE pack with signed/unsigned saturation. */
723 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
724 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
725 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
726
727 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
728 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
729 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
730 op1, op2));
731 emit_insn (insn);
732
733 ix86_move_vector_high_sse_to_mmx (op0);
734 }
735
736 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
737
738 void
739 ix86_split_mmx_punpck (rtx operands[], bool high_p)
740 {
741 rtx op0 = operands[0];
742 rtx op1 = operands[1];
743 rtx op2 = operands[2];
744 machine_mode mode = GET_MODE (op0);
745 rtx mask;
746 /* The corresponding SSE mode. */
747 machine_mode sse_mode, double_sse_mode;
748
749 switch (mode)
750 {
751 case E_V8QImode:
752 sse_mode = V16QImode;
753 double_sse_mode = V32QImode;
754 mask = gen_rtx_PARALLEL (VOIDmode,
755 gen_rtvec (16,
756 GEN_INT (0), GEN_INT (16),
757 GEN_INT (1), GEN_INT (17),
758 GEN_INT (2), GEN_INT (18),
759 GEN_INT (3), GEN_INT (19),
760 GEN_INT (4), GEN_INT (20),
761 GEN_INT (5), GEN_INT (21),
762 GEN_INT (6), GEN_INT (22),
763 GEN_INT (7), GEN_INT (23)));
764 break;
765
766 case E_V4HImode:
767 sse_mode = V8HImode;
768 double_sse_mode = V16HImode;
769 mask = gen_rtx_PARALLEL (VOIDmode,
770 gen_rtvec (8,
771 GEN_INT (0), GEN_INT (8),
772 GEN_INT (1), GEN_INT (9),
773 GEN_INT (2), GEN_INT (10),
774 GEN_INT (3), GEN_INT (11)));
775 break;
776
777 case E_V2SImode:
778 sse_mode = V4SImode;
779 double_sse_mode = V8SImode;
780 mask = gen_rtx_PARALLEL (VOIDmode,
781 gen_rtvec (4,
782 GEN_INT (0), GEN_INT (4),
783 GEN_INT (1), GEN_INT (5)));
784 break;
785
786 default:
787 gcc_unreachable ();
788 }
789
790 /* Generate SSE punpcklXX. */
791 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
792 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
793 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
794
795 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
796 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
797 rtx insn = gen_rtx_SET (dest, op2);
798 emit_insn (insn);
799
800 if (high_p)
801 {
802 /* Move bits 64:127 to bits 0:63. */
803 mask = gen_rtx_PARALLEL (VOIDmode,
804 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
805 GEN_INT (0), GEN_INT (0)));
806 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
807 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
808 insn = gen_rtx_SET (dest, op1);
809 emit_insn (insn);
810 }
811 }
812
813 /* Helper function of ix86_fixup_binary_operands to canonicalize
814 operand order. Returns true if the operands should be swapped. */
815
816 static bool
817 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
818 rtx operands[])
819 {
820 rtx dst = operands[0];
821 rtx src1 = operands[1];
822 rtx src2 = operands[2];
823
824 /* If the operation is not commutative, we can't do anything. */
825 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
826 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
827 return false;
828
829 /* Highest priority is that src1 should match dst. */
830 if (rtx_equal_p (dst, src1))
831 return false;
832 if (rtx_equal_p (dst, src2))
833 return true;
834
835 /* Next highest priority is that immediate constants come second. */
836 if (immediate_operand (src2, mode))
837 return false;
838 if (immediate_operand (src1, mode))
839 return true;
840
841 /* Lowest priority is that memory references should come second. */
842 if (MEM_P (src2))
843 return false;
844 if (MEM_P (src1))
845 return true;
846
847 return false;
848 }
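/* For example, for the commutative "x = y + x" the sources are swapped so
   that src1 matches the destination and the two-address form
   "addl %edx, %eax" can be used; a constant or memory src1 is likewise
   moved into the second position.  */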
849
850
851 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
852 destination to use for the operation. If different from the true
853 destination in operands[0], a copy operation will be required. */
854
855 rtx
856 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
857 rtx operands[])
858 {
859 rtx dst = operands[0];
860 rtx src1 = operands[1];
861 rtx src2 = operands[2];
862
863 /* Canonicalize operand order. */
864 if (ix86_swap_binary_operands_p (code, mode, operands))
865 {
866 /* It is invalid to swap operands of different modes. */
867 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
868
869 std::swap (src1, src2);
870 }
871
872 /* Both source operands cannot be in memory. */
873 if (MEM_P (src1) && MEM_P (src2))
874 {
875 /* Optimization: Only read from memory once. */
876 if (rtx_equal_p (src1, src2))
877 {
878 src2 = force_reg (mode, src2);
879 src1 = src2;
880 }
881 else if (rtx_equal_p (dst, src1))
882 src2 = force_reg (mode, src2);
883 else
884 src1 = force_reg (mode, src1);
885 }
886
887 /* If the destination is memory, and we do not have matching source
888 operands, do things in registers. */
889 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
890 dst = gen_reg_rtx (mode);
891
892 /* Source 1 cannot be a constant. */
893 if (CONSTANT_P (src1))
894 src1 = force_reg (mode, src1);
895
896 /* Source 1 cannot be a non-matching memory. */
897 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
898 src1 = force_reg (mode, src1);
899
900 /* Improve address combine. */
901 if (code == PLUS
902 && GET_MODE_CLASS (mode) == MODE_INT
903 && MEM_P (src2))
904 src2 = force_reg (mode, src2);
905
906 operands[1] = src1;
907 operands[2] = src2;
908 return dst;
909 }
910
911 /* Similarly, but assume that the destination has already been
912 set up properly. */
913
914 void
915 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
916 machine_mode mode, rtx operands[])
917 {
918 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
919 gcc_assert (dst == operands[0]);
920 }
921
922 /* Attempt to expand a binary operator. Make the expansion closer to the
923 actual machine than just general_operand, which would allow 3 separate
924 memory references (one output, two inputs) in a single insn. */
925
926 void
927 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
928 rtx operands[])
929 {
930 rtx src1, src2, dst, op, clob;
931
932 dst = ix86_fixup_binary_operands (code, mode, operands);
933 src1 = operands[1];
934 src2 = operands[2];
935
936 /* Emit the instruction. */
937
938 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
939
940 if (reload_completed
941 && code == PLUS
942 && !rtx_equal_p (dst, src1))
943 {
944 /* This is going to be an LEA; avoid splitting it later. */
945 emit_insn (op);
946 }
947 else
948 {
949 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
950 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
951 }
952
953 /* Fix up the destination if needed. */
954 if (dst != operands[0])
955 emit_move_insn (operands[0], dst);
956 }
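/* As a sketch, expanding an SImode PLUS on pseudos normally produces

     (parallel [(set (reg:SI 60) (plus:SI (reg:SI 61) (reg:SI 62)))
                (clobber (reg:CC flags))])

   (register numbers illustrative), whereas after reload an add whose
   destination does not match src1 is emitted without the clobber so that
   it can remain a single lea.  */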
957
958 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
959 the given OPERANDS. */
960
961 void
962 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
963 rtx operands[])
964 {
965 rtx op1 = NULL_RTX, op2 = NULL_RTX;
966 if (SUBREG_P (operands[1]))
967 {
968 op1 = operands[1];
969 op2 = operands[2];
970 }
971 else if (SUBREG_P (operands[2]))
972 {
973 op1 = operands[2];
974 op2 = operands[1];
975 }
976 /* Optimize (__m128i) d | (__m128i) e and similar code
977 when d and e are float vectors into float vector logical
978 insn. In C/C++ without using intrinsics there is no other way
979 to express vector logical operation on float vectors than
980 to cast them temporarily to integer vectors. */
981 if (op1
982 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
983 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
984 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
985 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
986 && SUBREG_BYTE (op1) == 0
987 && (GET_CODE (op2) == CONST_VECTOR
988 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
989 && SUBREG_BYTE (op2) == 0))
990 && can_create_pseudo_p ())
991 {
992 rtx dst;
993 switch (GET_MODE (SUBREG_REG (op1)))
994 {
995 case E_V4SFmode:
996 case E_V8SFmode:
997 case E_V16SFmode:
998 case E_V2DFmode:
999 case E_V4DFmode:
1000 case E_V8DFmode:
1001 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1002 if (GET_CODE (op2) == CONST_VECTOR)
1003 {
1004 op2 = gen_lowpart (GET_MODE (dst), op2);
1005 op2 = force_reg (GET_MODE (dst), op2);
1006 }
1007 else
1008 {
1009 op1 = operands[1];
1010 op2 = SUBREG_REG (operands[2]);
1011 if (!vector_operand (op2, GET_MODE (dst)))
1012 op2 = force_reg (GET_MODE (dst), op2);
1013 }
1014 op1 = SUBREG_REG (op1);
1015 if (!vector_operand (op1, GET_MODE (dst)))
1016 op1 = force_reg (GET_MODE (dst), op1);
1017 emit_insn (gen_rtx_SET (dst,
1018 gen_rtx_fmt_ee (code, GET_MODE (dst),
1019 op1, op2)));
1020 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1021 return;
1022 default:
1023 break;
1024 }
1025 }
1026 if (!vector_operand (operands[1], mode))
1027 operands[1] = force_reg (mode, operands[1]);
1028 if (!vector_operand (operands[2], mode))
1029 operands[2] = force_reg (mode, operands[2]);
1030 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1031 emit_insn (gen_rtx_SET (operands[0],
1032 gen_rtx_fmt_ee (code, mode, operands[1],
1033 operands[2])));
1034 }
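/* Illustrative source-level case for the SUBREG handling above (using GNU
   vector extensions; the variable names are made up):

     __m128 a, b;
     __m128i x = (__m128i) a | (__m128i) b;

   is expanded as a float-vector IOR, i.e. "orps" rather than "por", so
   the values stay in the float domain.  */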
1035
1036 /* Return TRUE or FALSE depending on whether the binary operator meets the
1037 appropriate constraints. */
1038
1039 bool
1040 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1041 rtx operands[3])
1042 {
1043 rtx dst = operands[0];
1044 rtx src1 = operands[1];
1045 rtx src2 = operands[2];
1046
1047 /* Both source operands cannot be in memory. */
1048 if (MEM_P (src1) && MEM_P (src2))
1049 return false;
1050
1051 /* Canonicalize operand order for commutative operators. */
1052 if (ix86_swap_binary_operands_p (code, mode, operands))
1053 std::swap (src1, src2);
1054
1055 /* If the destination is memory, we must have a matching source operand. */
1056 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1057 return false;
1058
1059 /* Source 1 cannot be a constant. */
1060 if (CONSTANT_P (src1))
1061 return false;
1062
1063 /* Source 1 cannot be a non-matching memory. */
1064 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1065 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1066 return (code == AND
1067 && (mode == HImode
1068 || mode == SImode
1069 || (TARGET_64BIT && mode == DImode))
1070 && satisfies_constraint_L (src2));
1071
1072 return true;
1073 }
1074
1075 /* Attempt to expand a unary operator. Make the expansion closer to the
1076 actual machine than just general_operand, which would allow 2 separate
1077 memory references (one output, one input) in a single insn. */
1078
1079 void
1080 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1081 rtx operands[])
1082 {
1083 bool matching_memory = false;
1084 rtx src, dst, op, clob;
1085
1086 dst = operands[0];
1087 src = operands[1];
1088
1089 /* If the destination is memory, and we do not have matching source
1090 operands, do things in registers. */
1091 if (MEM_P (dst))
1092 {
1093 if (rtx_equal_p (dst, src))
1094 matching_memory = true;
1095 else
1096 dst = gen_reg_rtx (mode);
1097 }
1098
1099 /* When source operand is memory, destination must match. */
1100 if (MEM_P (src) && !matching_memory)
1101 src = force_reg (mode, src);
1102
1103 /* Emit the instruction. */
1104
1105 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1106
1107 if (code == NOT)
1108 emit_insn (op);
1109 else
1110 {
1111 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1112 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1113 }
1114
1115 /* Fix up the destination if needed. */
1116 if (dst != operands[0])
1117 emit_move_insn (operands[0], dst);
1118 }
1119
1120 /* Predict just emitted jump instruction to be taken with probability PROB. */
1121
1122 static void
1123 predict_jump (int prob)
1124 {
1125 rtx_insn *insn = get_last_insn ();
1126 gcc_assert (JUMP_P (insn));
1127 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1128 }
1129
1130 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1131 divisor are within the range [0-255]. */
1132
1133 void
1134 ix86_split_idivmod (machine_mode mode, rtx operands[],
1135 bool unsigned_p)
1136 {
1137 rtx_code_label *end_label, *qimode_label;
1138 rtx div, mod;
1139 rtx_insn *insn;
1140 rtx scratch, tmp0, tmp1, tmp2;
1141 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1142
1143 switch (mode)
1144 {
1145 case E_SImode:
1146 if (GET_MODE (operands[0]) == SImode)
1147 {
1148 if (GET_MODE (operands[1]) == SImode)
1149 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1150 else
1151 gen_divmod4_1
1152 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1153 }
1154 else
1155 gen_divmod4_1
1156 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1157 break;
1158
1159 case E_DImode:
1160 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1161 break;
1162
1163 default:
1164 gcc_unreachable ();
1165 }
1166
1167 end_label = gen_label_rtx ();
1168 qimode_label = gen_label_rtx ();
1169
1170 scratch = gen_reg_rtx (mode);
1171
1172 /* Use 8bit unsigned divmod if dividend and divisor are within
1173 the range [0-255]. */
1174 emit_move_insn (scratch, operands[2]);
1175 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1176 scratch, 1, OPTAB_DIRECT);
1177 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1178 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1179 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1180 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1181 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1182 pc_rtx);
1183 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1184 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1185 JUMP_LABEL (insn) = qimode_label;
1186
1187 /* Generate original signed/unsigned divmod. */
1188 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1189 operands[2], operands[3]));
1190
1191 /* Branch to the end. */
1192 emit_jump_insn (gen_jump (end_label));
1193 emit_barrier ();
1194
1195 /* Generate 8bit unsigned divide. */
1196 emit_label (qimode_label);
1197 /* Don't use operands[0] for result of 8bit divide since not all
1198 registers support QImode ZERO_EXTRACT. */
1199 tmp0 = lowpart_subreg (HImode, scratch, mode);
1200 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1201 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1202 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1203
1204 if (unsigned_p)
1205 {
1206 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1207 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1208 }
1209 else
1210 {
1211 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1212 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1213 }
1214 if (mode == SImode)
1215 {
1216 if (GET_MODE (operands[0]) != SImode)
1217 div = gen_rtx_ZERO_EXTEND (DImode, div);
1218 if (GET_MODE (operands[1]) != SImode)
1219 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1220 }
1221
1222 /* Extract remainder from AH. */
1223 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1224 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1225 GEN_INT (8), GEN_INT (8));
1226 insn = emit_move_insn (operands[1], tmp1);
1227 set_unique_reg_note (insn, REG_EQUAL, mod);
1228
1229 /* Zero extend quotient from AL. */
1230 tmp1 = gen_lowpart (QImode, tmp0);
1231 insn = emit_insn (gen_extend_insn
1232 (operands[0], tmp1,
1233 GET_MODE (operands[0]), QImode, 1));
1234 set_unique_reg_note (insn, REG_EQUAL, div);
1235
1236 emit_label (end_label);
1237 }
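/* The emitted sequence is roughly (unsigned SImode case):

     movl   dividend, %scratch
     orl    divisor, %scratch
     testl  $-256, %scratch        # both values in [0-255]?
     je     .Lqimode
     <full 32-bit div/idiv>        # quotient and remainder as usual
     jmp    .Lend
   .Lqimode:
     <8-bit divb>                  # quotient in %al, remainder in %ah
   .Lend:

   with the 8-bit path extracting the remainder from AH via ZERO_EXTRACT
   and zero-extending the quotient from AL.  */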
1238
1239 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1240 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1241
1242 void
1243 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1244 rtx dst, rtx src)
1245 {
1246 rtx op, clob;
1247
1248 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1249 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1250
1251 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1252 }
1253
1254 /* Return true if the def of regno1 is nearer to INSN than the def of regno2. */
1255
1256 static bool
1257 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1258 {
1259 rtx_insn *prev = insn;
1260 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1261
1262 if (insn == start)
1263 return false;
1264 while (prev && prev != start)
1265 {
1266 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1267 {
1268 prev = PREV_INSN (prev);
1269 continue;
1270 }
1271 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1272 return true;
1273 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1274 return false;
1275 prev = PREV_INSN (prev);
1276 }
1277
1278 /* None of the regs is defined in the bb. */
1279 return false;
1280 }
1281
1282 /* Split lea instructions into a sequence of instructions
1283 which are executed on the ALU to avoid AGU stalls.
1284 It is assumed that the flags register may be clobbered
1285 at the position of the lea. */
1286
1287 void
1288 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1289 {
1290 unsigned int regno0, regno1, regno2;
1291 struct ix86_address parts;
1292 rtx target, tmp;
1293 int ok, adds;
1294
1295 ok = ix86_decompose_address (operands[1], &parts);
1296 gcc_assert (ok);
1297
1298 target = gen_lowpart (mode, operands[0]);
1299
1300 regno0 = true_regnum (target);
1301 regno1 = INVALID_REGNUM;
1302 regno2 = INVALID_REGNUM;
1303
1304 if (parts.base)
1305 {
1306 parts.base = gen_lowpart (mode, parts.base);
1307 regno1 = true_regnum (parts.base);
1308 }
1309
1310 if (parts.index)
1311 {
1312 parts.index = gen_lowpart (mode, parts.index);
1313 regno2 = true_regnum (parts.index);
1314 }
1315
1316 if (parts.disp)
1317 parts.disp = gen_lowpart (mode, parts.disp);
1318
1319 if (parts.scale > 1)
1320 {
1321 /* Case r1 = r1 + ... */
1322 if (regno1 == regno0)
1323 {
1324 /* If r2 were the same register as r1, the case
1325 r1 = r1 + C * r2 would need a real multiplication,
1326 which is very expensive. Assume the cost model is
1327 wrong if we get such a case here. */
1328 gcc_assert (regno2 != regno0);
1329
1330 for (adds = parts.scale; adds > 0; adds--)
1331 ix86_emit_binop (PLUS, mode, target, parts.index);
1332 }
1333 else
1334 {
1335 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1336 if (regno0 != regno2)
1337 emit_insn (gen_rtx_SET (target, parts.index));
1338
1339 /* Use shift for scaling. */
1340 ix86_emit_binop (ASHIFT, mode, target,
1341 GEN_INT (exact_log2 (parts.scale)));
1342
1343 if (parts.base)
1344 ix86_emit_binop (PLUS, mode, target, parts.base);
1345
1346 if (parts.disp && parts.disp != const0_rtx)
1347 ix86_emit_binop (PLUS, mode, target, parts.disp);
1348 }
1349 }
1350 else if (!parts.base && !parts.index)
1351 {
1352 gcc_assert(parts.disp);
1353 emit_insn (gen_rtx_SET (target, parts.disp));
1354 }
1355 else
1356 {
1357 if (!parts.base)
1358 {
1359 if (regno0 != regno2)
1360 emit_insn (gen_rtx_SET (target, parts.index));
1361 }
1362 else if (!parts.index)
1363 {
1364 if (regno0 != regno1)
1365 emit_insn (gen_rtx_SET (target, parts.base));
1366 }
1367 else
1368 {
1369 if (regno0 == regno1)
1370 tmp = parts.index;
1371 else if (regno0 == regno2)
1372 tmp = parts.base;
1373 else
1374 {
1375 rtx tmp1;
1376
1377 /* Find better operand for SET instruction, depending
1378 on which definition is farther from the insn. */
1379 if (find_nearest_reg_def (insn, regno1, regno2))
1380 tmp = parts.index, tmp1 = parts.base;
1381 else
1382 tmp = parts.base, tmp1 = parts.index;
1383
1384 emit_insn (gen_rtx_SET (target, tmp));
1385
1386 if (parts.disp && parts.disp != const0_rtx)
1387 ix86_emit_binop (PLUS, mode, target, parts.disp);
1388
1389 ix86_emit_binop (PLUS, mode, target, tmp1);
1390 return;
1391 }
1392
1393 ix86_emit_binop (PLUS, mode, target, tmp);
1394 }
1395
1396 if (parts.disp && parts.disp != const0_rtx)
1397 ix86_emit_binop (PLUS, mode, target, parts.disp);
1398 }
1399 }
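/* Example split (illustrative register choices): if the lea

     leal 4(%ebx,%ecx,4), %eax

   would cause an AGU stall, it is rewritten as ALU instructions

     movl %ecx, %eax
     sall $2, %eax
     addl %ebx, %eax
     addl $4, %eax

   and the r1 = r1 + C * r2 form with a small scale is instead handled by
   repeated additions of the index.  */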
1400
1401 /* Post-reload splitter for converting an SF or DFmode value in an
1402 SSE register into an unsigned SImode value. */
1403
1404 void
1405 ix86_split_convert_uns_si_sse (rtx operands[])
1406 {
1407 machine_mode vecmode;
1408 rtx value, large, zero_or_two31, input, two31, x;
1409
1410 large = operands[1];
1411 zero_or_two31 = operands[2];
1412 input = operands[3];
1413 two31 = operands[4];
1414 vecmode = GET_MODE (large);
1415 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1416
1417 /* Load up the value into the low element. We must ensure that the other
1418 elements are valid floats -- zero is the easiest such value. */
1419 if (MEM_P (input))
1420 {
1421 if (vecmode == V4SFmode)
1422 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1423 else
1424 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1425 }
1426 else
1427 {
1428 input = gen_rtx_REG (vecmode, REGNO (input));
1429 emit_move_insn (value, CONST0_RTX (vecmode));
1430 if (vecmode == V4SFmode)
1431 emit_insn (gen_sse_movss (value, value, input));
1432 else
1433 emit_insn (gen_sse2_movsd (value, value, input));
1434 }
1435
1436 emit_move_insn (large, two31);
1437 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1438
1439 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1440 emit_insn (gen_rtx_SET (large, x));
1441
1442 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1443 emit_insn (gen_rtx_SET (zero_or_two31, x));
1444
1445 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1446 emit_insn (gen_rtx_SET (value, x));
1447
1448 large = gen_rtx_REG (V4SImode, REGNO (large));
1449 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1450
1451 x = gen_rtx_REG (V4SImode, REGNO (value));
1452 if (vecmode == V4SFmode)
1453 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1454 else
1455 emit_insn (gen_sse2_cvttpd2dq (x, value));
1456 value = x;
1457
1458 emit_insn (gen_xorv4si3 (value, value, large));
1459 }
1460
1461 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1462 machine_mode mode, rtx target,
1463 rtx var, int one_var);
1464
1465 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1466 Expects the 64-bit DImode to be supplied in a pair of integral
1467 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1468 -mfpmath=sse, !optimize_size only. */
1469
1470 void
1471 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1472 {
1473 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1474 rtx int_xmm, fp_xmm;
1475 rtx biases, exponents;
1476 rtx x;
1477
1478 int_xmm = gen_reg_rtx (V4SImode);
1479 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1480 emit_insn (gen_movdi_to_sse (int_xmm, input));
1481 else if (TARGET_SSE_SPLIT_REGS)
1482 {
1483 emit_clobber (int_xmm);
1484 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1485 }
1486 else
1487 {
1488 x = gen_reg_rtx (V2DImode);
1489 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1490 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1491 }
1492
1493 x = gen_rtx_CONST_VECTOR (V4SImode,
1494 gen_rtvec (4, GEN_INT (0x43300000UL),
1495 GEN_INT (0x45300000UL),
1496 const0_rtx, const0_rtx));
1497 exponents = validize_mem (force_const_mem (V4SImode, x));
1498
1499 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1500 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1501
1502 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1503 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1504 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1505 (0x1.0p84 + double(fp_value_hi_xmm)).
1506 Note these exponents differ by 32. */
1507
1508 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1509
1510 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1511 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1512 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1513 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1514 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1515 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1516 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1517 biases = validize_mem (force_const_mem (V2DFmode, biases));
1518 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1519
1520 /* Add the upper and lower DFmode values together. */
1521 if (TARGET_SSE3)
1522 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1523 else
1524 {
1525 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1526 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1527 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1528 }
1529
1530 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1531 }
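/* Worked example of the bias trick above, for the input
   0x0000000300000002 (hi = 3, lo = 2): the interleave builds the doubles
   0x1.0p52 + 2 and 0x1.0p84 + 3 * 0x1.0p32; subtracting the two biases
   leaves exactly 2.0 and 3.0 * 2^32, and the final addition yields
   12884901890.0 == 3 * 2^32 + 2.  */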
1532
1533 /* Not used, but eases macroization of patterns. */
1534 void
1535 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1536 {
1537 gcc_unreachable ();
1538 }
1539
1540 /* Convert an unsigned SImode value into a DFmode. Only currently used
1541 for SSE, but applicable anywhere. */
1542
1543 void
1544 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1545 {
1546 REAL_VALUE_TYPE TWO31r;
1547 rtx x, fp;
1548
1549 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1550 NULL, 1, OPTAB_DIRECT);
1551
1552 fp = gen_reg_rtx (DFmode);
1553 emit_insn (gen_floatsidf2 (fp, x));
1554
1555 real_ldexp (&TWO31r, &dconst1, 31);
1556 x = const_double_from_real_value (TWO31r, DFmode);
1557
1558 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1559 if (x != target)
1560 emit_move_insn (target, x);
1561 }
1562
1563 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1564 32-bit mode; otherwise we have a direct convert instruction. */
1565
1566 void
1567 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1568 {
1569 REAL_VALUE_TYPE TWO32r;
1570 rtx fp_lo, fp_hi, x;
1571
1572 fp_lo = gen_reg_rtx (DFmode);
1573 fp_hi = gen_reg_rtx (DFmode);
1574
1575 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1576
1577 real_ldexp (&TWO32r, &dconst1, 32);
1578 x = const_double_from_real_value (TWO32r, DFmode);
1579 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1580
1581 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1582
1583 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1584 0, OPTAB_DIRECT);
1585 if (x != target)
1586 emit_move_insn (target, x);
1587 }
1588
1589 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1590 For x86_32, -mfpmath=sse, !optimize_size only. */
1591 void
1592 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1593 {
1594 REAL_VALUE_TYPE ONE16r;
1595 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1596
1597 real_ldexp (&ONE16r, &dconst1, 16);
1598 x = const_double_from_real_value (ONE16r, SFmode);
1599 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1600 NULL, 0, OPTAB_DIRECT);
1601 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1602 NULL, 0, OPTAB_DIRECT);
1603 fp_hi = gen_reg_rtx (SFmode);
1604 fp_lo = gen_reg_rtx (SFmode);
1605 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1606 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1607 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1608 0, OPTAB_DIRECT);
1609 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1610 0, OPTAB_DIRECT);
1611 if (!rtx_equal_p (target, fp_hi))
1612 emit_move_insn (target, fp_hi);
1613 }
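/* Worked sketch of the split above: an input U is divided into
   hi = U >> 16 and lo = U & 0xffff; both halves convert to SFmode
   exactly, hi * 0x1.0p16 is still exact, and only the final addition
   hi * 0x1.0p16 + lo can round, matching (float) U.  */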
1614
1615 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1616 a vector of unsigned ints VAL to a vector of floats TARGET. */
1617
1618 void
1619 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1620 {
1621 rtx tmp[8];
1622 REAL_VALUE_TYPE TWO16r;
1623 machine_mode intmode = GET_MODE (val);
1624 machine_mode fltmode = GET_MODE (target);
1625 rtx (*cvt) (rtx, rtx);
1626
1627 if (intmode == V4SImode)
1628 cvt = gen_floatv4siv4sf2;
1629 else
1630 cvt = gen_floatv8siv8sf2;
1631 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1632 tmp[0] = force_reg (intmode, tmp[0]);
1633 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1634 OPTAB_DIRECT);
1635 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1636 NULL_RTX, 1, OPTAB_DIRECT);
1637 tmp[3] = gen_reg_rtx (fltmode);
1638 emit_insn (cvt (tmp[3], tmp[1]));
1639 tmp[4] = gen_reg_rtx (fltmode);
1640 emit_insn (cvt (tmp[4], tmp[2]));
1641 real_ldexp (&TWO16r, &dconst1, 16);
1642 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1643 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1644 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1645 OPTAB_DIRECT);
1646 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1647 OPTAB_DIRECT);
1648 if (tmp[7] != target)
1649 emit_move_insn (target, tmp[7]);
1650 }
1651
1652 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1653 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1654 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1655 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1656
1657 rtx
1658 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1659 {
1660 REAL_VALUE_TYPE TWO31r;
1661 rtx two31r, tmp[4];
1662 machine_mode mode = GET_MODE (val);
1663 machine_mode scalarmode = GET_MODE_INNER (mode);
1664 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1665 rtx (*cmp) (rtx, rtx, rtx, rtx);
1666 int i;
1667
1668 for (i = 0; i < 3; i++)
1669 tmp[i] = gen_reg_rtx (mode);
1670 real_ldexp (&TWO31r, &dconst1, 31);
1671 two31r = const_double_from_real_value (TWO31r, scalarmode);
1672 two31r = ix86_build_const_vector (mode, 1, two31r);
1673 two31r = force_reg (mode, two31r);
1674 switch (mode)
1675 {
1676 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1677 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1678 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1679 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1680 default: gcc_unreachable ();
1681 }
1682 tmp[3] = gen_rtx_LE (mode, two31r, val);
1683 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1684 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1685 0, OPTAB_DIRECT);
1686 if (intmode == V4SImode || TARGET_AVX2)
1687 *xorp = expand_simple_binop (intmode, ASHIFT,
1688 gen_lowpart (intmode, tmp[0]),
1689 GEN_INT (31), NULL_RTX, 0,
1690 OPTAB_DIRECT);
1691 else
1692 {
1693 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1694 two31 = ix86_build_const_vector (intmode, 1, two31);
1695 *xorp = expand_simple_binop (intmode, AND,
1696 gen_lowpart (intmode, tmp[0]),
1697 two31, NULL_RTX, 0,
1698 OPTAB_DIRECT);
1699 }
1700 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1701 0, OPTAB_DIRECT);
1702 }
1703
1704 /* Generate code for floating point ABS or NEG. */
1705
1706 void
1707 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1708 rtx operands[])
1709 {
1710 rtx set, dst, src;
1711 bool use_sse = false;
1712 bool vector_mode = VECTOR_MODE_P (mode);
1713 machine_mode vmode = mode;
1714 rtvec par;
1715
1716 if (vector_mode || mode == TFmode)
1717 use_sse = true;
1718 else if (TARGET_SSE_MATH)
1719 {
1720 use_sse = SSE_FLOAT_MODE_P (mode);
1721 if (mode == SFmode)
1722 vmode = V4SFmode;
1723 else if (mode == DFmode)
1724 vmode = V2DFmode;
1725 }
1726
1727 dst = operands[0];
1728 src = operands[1];
1729
1730 set = gen_rtx_fmt_e (code, mode, src);
1731 set = gen_rtx_SET (dst, set);
1732
1733 if (use_sse)
1734 {
1735 rtx mask, use, clob;
1736
1737 /* NEG and ABS performed with SSE use bitwise mask operations.
1738 Create the appropriate mask now. */
1739 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1740 use = gen_rtx_USE (VOIDmode, mask);
1741 if (vector_mode || mode == TFmode)
1742 par = gen_rtvec (2, set, use);
1743 else
1744 {
1745 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1746 par = gen_rtvec (3, set, use, clob);
1747 }
1748 }
1749 else
1750 {
1751 rtx clob;
1752
1753 /* Changing the sign of FP values can be done using the integer unit too. */
1754 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1755 par = gen_rtvec (2, set, clob);
1756 }
1757
1758 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1759 }
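/* With SSE math this becomes a bitwise sign-mask operation; for SFmode
   the eventual code is roughly

     xorps .LC0(%rip), %xmm0      # NEG: mask 0x80000000 in each lane
     andps .LC1(%rip), %xmm0      # ABS: mask 0x7fffffff in each lane

   where the mask is built by ix86_build_signbit_mask and referenced via
   the USE in the parallel.  */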
1760
1761 /* Deconstruct a floating point ABS or NEG operation
1762 with integer registers into integer operations. */
1763
1764 void
1765 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1766 rtx operands[])
1767 {
1768 enum rtx_code absneg_op;
1769 rtx dst, set;
1770
1771 gcc_assert (operands_match_p (operands[0], operands[1]));
1772
1773 switch (mode)
1774 {
1775 case E_SFmode:
1776 dst = gen_lowpart (SImode, operands[0]);
1777
1778 if (code == ABS)
1779 {
1780 set = gen_int_mode (0x7fffffff, SImode);
1781 absneg_op = AND;
1782 }
1783 else
1784 {
1785 set = gen_int_mode (0x80000000, SImode);
1786 absneg_op = XOR;
1787 }
1788 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1789 break;
1790
1791 case E_DFmode:
1792 if (TARGET_64BIT)
1793 {
1794 dst = gen_lowpart (DImode, operands[0]);
1795 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1796
1797 if (code == ABS)
1798 set = const0_rtx;
1799 else
1800 set = gen_rtx_NOT (DImode, dst);
1801 }
1802 else
1803 {
1804 dst = gen_highpart (SImode, operands[0]);
1805
1806 if (code == ABS)
1807 {
1808 set = gen_int_mode (0x7fffffff, SImode);
1809 absneg_op = AND;
1810 }
1811 else
1812 {
1813 set = gen_int_mode (0x80000000, SImode);
1814 absneg_op = XOR;
1815 }
1816 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1817 }
1818 break;
1819
1820 case E_XFmode:
1821 dst = gen_rtx_REG (SImode,
1822 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1823 if (code == ABS)
1824 {
1825 set = GEN_INT (0x7fff);
1826 absneg_op = AND;
1827 }
1828 else
1829 {
1830 set = GEN_INT (0x8000);
1831 absneg_op = XOR;
1832 }
1833 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1834 break;
1835
1836 default:
1837 gcc_unreachable ();
1838 }
1839
1840 set = gen_rtx_SET (dst, set);
1841
1842 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1843 rtvec par = gen_rtvec (2, set, clob);
1844
1845 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1846 }
1847
1848 /* Expand a copysign operation. Special case operand 0 being a constant. */
1849
1850 void
1851 ix86_expand_copysign (rtx operands[])
1852 {
1853 machine_mode mode, vmode;
1854 rtx dest, op0, op1, mask;
1855
1856 dest = operands[0];
1857 op0 = operands[1];
1858 op1 = operands[2];
1859
1860 mode = GET_MODE (dest);
1861
1862 if (mode == SFmode)
1863 vmode = V4SFmode;
1864 else if (mode == DFmode)
1865 vmode = V2DFmode;
1866 else if (mode == TFmode)
1867 vmode = mode;
1868 else
1869 gcc_unreachable ();
1870
1871 mask = ix86_build_signbit_mask (vmode, 0, 0);
1872
1873 if (CONST_DOUBLE_P (op0))
1874 {
1875 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1876 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1877
1878 if (mode == SFmode || mode == DFmode)
1879 {
1880 if (op0 == CONST0_RTX (mode))
1881 op0 = CONST0_RTX (vmode);
1882 else
1883 {
1884 rtx v = ix86_build_const_vector (vmode, false, op0);
1885
1886 op0 = force_reg (vmode, v);
1887 }
1888 }
1889 else if (op0 != CONST0_RTX (mode))
1890 op0 = force_reg (mode, op0);
1891
1892 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1893 }
1894 else
1895 {
1896 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1897
1898 emit_insn (gen_copysign3_var
1899 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1900 }
1901 }
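/* The underlying identity, shown on SFmode bit patterns:

     copysign (x, y) = (x & 0x7fffffff) | (y & 0x80000000)

   gen_copysign3_const handles a constant magnitude (made non-negative
   above), while gen_copysign3_var also needs the inverted mask.  */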
1902
1903 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1904 be a constant, and so has already been expanded into a vector constant. */
1905
1906 void
1907 ix86_split_copysign_const (rtx operands[])
1908 {
1909 machine_mode mode, vmode;
1910 rtx dest, op0, mask, x;
1911
1912 dest = operands[0];
1913 op0 = operands[1];
1914 mask = operands[3];
1915
1916 mode = GET_MODE (dest);
1917 vmode = GET_MODE (mask);
1918
1919 dest = lowpart_subreg (vmode, dest, mode);
1920 x = gen_rtx_AND (vmode, dest, mask);
1921 emit_insn (gen_rtx_SET (dest, x));
1922
1923 if (op0 != CONST0_RTX (vmode))
1924 {
1925 x = gen_rtx_IOR (vmode, dest, op0);
1926 emit_insn (gen_rtx_SET (dest, x));
1927 }
1928 }
1929
1930 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1931 so we have to do two masks. */
1932
1933 void
1934 ix86_split_copysign_var (rtx operands[])
1935 {
1936 machine_mode mode, vmode;
1937 rtx dest, scratch, op0, op1, mask, nmask, x;
1938
1939 dest = operands[0];
1940 scratch = operands[1];
1941 op0 = operands[2];
1942 op1 = operands[3];
1943 nmask = operands[4];
1944 mask = operands[5];
1945
1946 mode = GET_MODE (dest);
1947 vmode = GET_MODE (mask);
1948
1949 if (rtx_equal_p (op0, op1))
1950 {
1951 /* Shouldn't happen often (it's useless, obviously), but when it does
1952 we'd generate incorrect code if we continue below. */
1953 emit_move_insn (dest, op0);
1954 return;
1955 }
1956
1957 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1958 {
1959 gcc_assert (REGNO (op1) == REGNO (scratch));
1960
1961 x = gen_rtx_AND (vmode, scratch, mask);
1962 emit_insn (gen_rtx_SET (scratch, x));
1963
1964 dest = mask;
1965 op0 = lowpart_subreg (vmode, op0, mode);
1966 x = gen_rtx_NOT (vmode, dest);
1967 x = gen_rtx_AND (vmode, x, op0);
1968 emit_insn (gen_rtx_SET (dest, x));
1969 }
1970 else
1971 {
1972 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1973 {
1974 x = gen_rtx_AND (vmode, scratch, mask);
1975 }
1976 else /* alternative 2,4 */
1977 {
1978 gcc_assert (REGNO (mask) == REGNO (scratch));
1979 op1 = lowpart_subreg (vmode, op1, mode);
1980 x = gen_rtx_AND (vmode, scratch, op1);
1981 }
1982 emit_insn (gen_rtx_SET (scratch, x));
1983
1984 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1985 {
1986 dest = lowpart_subreg (vmode, op0, mode);
1987 x = gen_rtx_AND (vmode, dest, nmask);
1988 }
1989 else /* alternative 3,4 */
1990 {
1991 gcc_assert (REGNO (nmask) == REGNO (dest));
1992 dest = nmask;
1993 op0 = lowpart_subreg (vmode, op0, mode);
1994 x = gen_rtx_AND (vmode, dest, op0);
1995 }
1996 emit_insn (gen_rtx_SET (dest, x));
1997 }
1998
1999 x = gen_rtx_IOR (vmode, dest, scratch);
2000 emit_insn (gen_rtx_SET (dest, x));
2001 }
2002
2003 /* Expand an xorsign operation. */
2004
2005 void
2006 ix86_expand_xorsign (rtx operands[])
2007 {
2008 machine_mode mode, vmode;
2009 rtx dest, op0, op1, mask;
2010
2011 dest = operands[0];
2012 op0 = operands[1];
2013 op1 = operands[2];
2014
2015 mode = GET_MODE (dest);
2016
2017 if (mode == SFmode)
2018 vmode = V4SFmode;
2019 else if (mode == DFmode)
2020 vmode = V2DFmode;
2021 else
2022 gcc_unreachable ();
2023
2024 mask = ix86_build_signbit_mask (vmode, 0, 0);
2025
2026 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2027 }
2028
2029 /* Deconstruct an xorsign operation into bit masks. */
2030
2031 void
2032 ix86_split_xorsign (rtx operands[])
2033 {
2034 machine_mode mode, vmode;
2035 rtx dest, op0, mask, x;
2036
2037 dest = operands[0];
2038 op0 = operands[1];
2039 mask = operands[3];
2040
2041 mode = GET_MODE (dest);
2042 vmode = GET_MODE (mask);
2043
2044 dest = lowpart_subreg (vmode, dest, mode);
2045 x = gen_rtx_AND (vmode, dest, mask);
2046 emit_insn (gen_rtx_SET (dest, x));
2047
2048 op0 = lowpart_subreg (vmode, op0, mode);
2049 x = gen_rtx_XOR (vmode, dest, op0);
2050 emit_insn (gen_rtx_SET (dest, x));
2051 }
2052
2053 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2054
2055 void
2056 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2057 {
2058 machine_mode mode = GET_MODE (op0);
2059 rtx tmp;
2060
2061   /* Handle special case - vector comparison with boolean result, transform
2062 it using ptest instruction. */
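  /* For illustration (a rough sketch, not an exact listing): with SSE4.1
     a branch on "v0 == v1" for 128-bit vector operands is expected to
     come out approximately as
	pxor	%xmm1, %xmm0
	ptest	%xmm0, %xmm0
	je	label
     i.e. an XOR followed by a PTEST of the result against itself rather
     than an element-wise compare plus mask extraction.  */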
2063 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2064 {
2065 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2066 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2067
2068 gcc_assert (code == EQ || code == NE);
2069 /* Generate XOR since we can't check that one operand is zero vector. */
2070 tmp = gen_reg_rtx (mode);
2071 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2072 tmp = gen_lowpart (p_mode, tmp);
2073 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2074 gen_rtx_UNSPEC (CCmode,
2075 gen_rtvec (2, tmp, tmp),
2076 UNSPEC_PTEST)));
2077 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2078 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2079 gen_rtx_LABEL_REF (VOIDmode, label),
2080 pc_rtx);
2081 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2082 return;
2083 }
2084
2085 switch (mode)
2086 {
2087 case E_SFmode:
2088 case E_DFmode:
2089 case E_XFmode:
2090 case E_QImode:
2091 case E_HImode:
2092 case E_SImode:
2093 simple:
2094 tmp = ix86_expand_compare (code, op0, op1);
2095 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2096 gen_rtx_LABEL_REF (VOIDmode, label),
2097 pc_rtx);
2098 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2099 return;
2100
2101 case E_DImode:
2102 if (TARGET_64BIT)
2103 goto simple;
2104       /* For a 32-bit target, a DImode comparison may be performed
2105 	 in SSE registers.  To allow this, avoid splitting into SImode,
2106 	 which is achieved by doing the xor in DImode and then comparing
2107 	 with zero (a pattern recognized by the STV pass).  We do not
2108 	 compare using xor when optimizing
2109 	 for size.  */
2110 if (!optimize_insn_for_size_p ()
2111 && TARGET_STV
2112 && (code == EQ || code == NE))
2113 {
2114 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2115 op1 = const0_rtx;
2116 }
2117 /* FALLTHRU */
2118 case E_TImode:
2119     /* Expand double-word branch into multiple compare+branch.  */
2120 {
2121 rtx lo[2], hi[2];
2122 rtx_code_label *label2;
2123 enum rtx_code code1, code2, code3;
2124 machine_mode submode;
2125
2126 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2127 {
2128 std::swap (op0, op1);
2129 code = swap_condition (code);
2130 }
2131
2132 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2133 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2134
2135 submode = mode == DImode ? SImode : DImode;
2136
2137 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2138 avoid two branches. This costs one extra insn, so disable when
2139 optimizing for size. */
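	/* For example (an illustrative sketch), a 64-bit "a == b" on ia32
	   can then come out roughly as
		movl	a_lo, %eax
		xorl	b_lo, %eax
		movl	a_hi, %edx
		xorl	b_hi, %edx
		orl	%edx, %eax
		je	label
	   so only one conditional branch is needed.  */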
2140
2141 if ((code == EQ || code == NE)
2142 && (!optimize_insn_for_size_p ()
2143 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2144 {
2145 rtx xor0, xor1;
2146
2147 xor1 = hi[0];
2148 if (hi[1] != const0_rtx)
2149 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2150 NULL_RTX, 0, OPTAB_WIDEN);
2151
2152 xor0 = lo[0];
2153 if (lo[1] != const0_rtx)
2154 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2155 NULL_RTX, 0, OPTAB_WIDEN);
2156
2157 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2158 NULL_RTX, 0, OPTAB_WIDEN);
2159
2160 ix86_expand_branch (code, tmp, const0_rtx, label);
2161 return;
2162 }
2163
2164 	/* Otherwise, if we are doing a less-than or greater-than-or-equal
2165 	   comparison, op1 is a constant, and its low word is zero, then we
2166 	   can just examine the high word.  Similarly for a low word of -1
2167 	   and a less-than-or-equal or greater-than comparison.  */
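	/* For instance (an illustrative case), for the 64-bit comparison
	   "a < 0x500000000LL" on ia32 the low word of the constant is zero,
	   so a single signed compare of the high words against 5 decides
	   the branch and the low words never need to be looked at.  */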
2168
2169 if (CONST_INT_P (hi[1]))
2170 switch (code)
2171 {
2172 case LT: case LTU: case GE: case GEU:
2173 if (lo[1] == const0_rtx)
2174 {
2175 ix86_expand_branch (code, hi[0], hi[1], label);
2176 return;
2177 }
2178 break;
2179 case LE: case LEU: case GT: case GTU:
2180 if (lo[1] == constm1_rtx)
2181 {
2182 ix86_expand_branch (code, hi[0], hi[1], label);
2183 return;
2184 }
2185 break;
2186 default:
2187 break;
2188 }
2189
2190 /* Emulate comparisons that do not depend on Zero flag with
2191 double-word subtraction. Note that only Overflow, Sign
2192 and Carry flags are valid, so swap arguments and condition
2193 of comparisons that would otherwise test Zero flag. */
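	/* As a rough sketch (not an exact listing), a signed 64-bit
	   "a < b" on ia32 then becomes approximately
		cmpl	b_lo, a_lo
		movl	a_hi, %ecx
		sbbl	b_hi, %ecx
		jl	label
	   with the unsigned variants branching on the carry flag (jb)
	   instead.  */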
2194
2195 switch (code)
2196 {
2197 case LE: case LEU: case GT: case GTU:
2198 std::swap (lo[0], lo[1]);
2199 std::swap (hi[0], hi[1]);
2200 code = swap_condition (code);
2201 /* FALLTHRU */
2202
2203 case LT: case LTU: case GE: case GEU:
2204 {
2205 bool uns = (code == LTU || code == GEU);
2206 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2207 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2208
2209 if (!nonimmediate_operand (lo[0], submode))
2210 lo[0] = force_reg (submode, lo[0]);
2211 if (!x86_64_general_operand (lo[1], submode))
2212 lo[1] = force_reg (submode, lo[1]);
2213
2214 if (!register_operand (hi[0], submode))
2215 hi[0] = force_reg (submode, hi[0]);
2216 if ((uns && !nonimmediate_operand (hi[1], submode))
2217 || (!uns && !x86_64_general_operand (hi[1], submode)))
2218 hi[1] = force_reg (submode, hi[1]);
2219
2220 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2221
2222 tmp = gen_rtx_SCRATCH (submode);
2223 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2224
2225 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2226 ix86_expand_branch (code, tmp, const0_rtx, label);
2227 return;
2228 }
2229
2230 default:
2231 break;
2232 }
2233
2234 /* Otherwise, we need two or three jumps. */
2235
2236 label2 = gen_label_rtx ();
2237
2238 code1 = code;
2239 code2 = swap_condition (code);
2240 code3 = unsigned_condition (code);
2241
2242 switch (code)
2243 {
2244 case LT: case GT: case LTU: case GTU:
2245 break;
2246
2247 case LE: code1 = LT; code2 = GT; break;
2248 case GE: code1 = GT; code2 = LT; break;
2249 case LEU: code1 = LTU; code2 = GTU; break;
2250 case GEU: code1 = GTU; code2 = LTU; break;
2251
2252 case EQ: code1 = UNKNOWN; code2 = NE; break;
2253 case NE: code2 = UNKNOWN; break;
2254
2255 default:
2256 gcc_unreachable ();
2257 }
2258
2259 /*
2260 * a < b =>
2261 * if (hi(a) < hi(b)) goto true;
2262 * if (hi(a) > hi(b)) goto false;
2263 * if (lo(a) < lo(b)) goto true;
2264 * false:
2265 */
2266
2267 if (code1 != UNKNOWN)
2268 ix86_expand_branch (code1, hi[0], hi[1], label);
2269 if (code2 != UNKNOWN)
2270 ix86_expand_branch (code2, hi[0], hi[1], label2);
2271
2272 ix86_expand_branch (code3, lo[0], lo[1], label);
2273
2274 if (code2 != UNKNOWN)
2275 emit_label (label2);
2276 return;
2277 }
2278
2279 default:
2280 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2281 goto simple;
2282 }
2283 }
2284
2285 /* Figure out whether to use unordered fp comparisons. */
2286
2287 static bool
2288 ix86_unordered_fp_compare (enum rtx_code code)
2289 {
2290 if (!TARGET_IEEE_FP)
2291 return false;
2292
2293 switch (code)
2294 {
2295 case LT:
2296 case LE:
2297 case GT:
2298 case GE:
2299 case LTGT:
2300 return false;
2301
2302 case EQ:
2303 case NE:
2304
2305 case UNORDERED:
2306 case ORDERED:
2307 case UNLT:
2308 case UNLE:
2309 case UNGT:
2310 case UNGE:
2311 case UNEQ:
2312 return true;
2313
2314 default:
2315 gcc_unreachable ();
2316 }
2317 }
2318
2319 /* Return a comparison that we can do and that is equivalent to
2320    swap_condition (code), except possibly for orderedness.
2321    Never change orderedness if TARGET_IEEE_FP, returning
2322    UNKNOWN in that case if necessary.  */
2323
2324 static enum rtx_code
2325 ix86_fp_swap_condition (enum rtx_code code)
2326 {
2327 switch (code)
2328 {
2329 case GT: /* GTU - CF=0 & ZF=0 */
2330 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2331 case GE: /* GEU - CF=0 */
2332 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2333 case UNLT: /* LTU - CF=1 */
2334 return TARGET_IEEE_FP ? UNKNOWN : GT;
2335 case UNLE: /* LEU - CF=1 | ZF=1 */
2336 return TARGET_IEEE_FP ? UNKNOWN : GE;
2337 default:
2338 return swap_condition (code);
2339 }
2340 }
2341
2342 /* Return the cost of comparison CODE using the best strategy for performance.
2343    All of the following functions use the number of instructions as the cost
2344    metric.  In the future this should be tweaked to compute bytes for optimize_size
2345    and take into account the performance of various instructions on various CPUs.  */
2346
2347 static int
2348 ix86_fp_comparison_cost (enum rtx_code code)
2349 {
2350 int arith_cost;
2351
2352 /* The cost of code using bit-twiddling on %ah. */
2353 switch (code)
2354 {
2355 case UNLE:
2356 case UNLT:
2357 case LTGT:
2358 case GT:
2359 case GE:
2360 case UNORDERED:
2361 case ORDERED:
2362 case UNEQ:
2363 arith_cost = 4;
2364 break;
2365 case LT:
2366 case NE:
2367 case EQ:
2368 case UNGE:
2369 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2370 break;
2371 case LE:
2372 case UNGT:
2373 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2374 break;
2375 default:
2376 gcc_unreachable ();
2377 }
2378
2379 switch (ix86_fp_comparison_strategy (code))
2380 {
2381 case IX86_FPCMP_COMI:
2382 return arith_cost > 4 ? 3 : 2;
2383 case IX86_FPCMP_SAHF:
2384 return arith_cost > 4 ? 4 : 3;
2385 default:
2386 return arith_cost;
2387 }
2388 }
2389
2390 /* Swap, force into registers, or otherwise massage the two operands
2391 to a fp comparison. The operands are updated in place; the new
2392 comparison code is returned. */
2393
2394 static enum rtx_code
2395 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2396 {
2397 bool unordered_compare = ix86_unordered_fp_compare (code);
2398 rtx op0 = *pop0, op1 = *pop1;
2399 machine_mode op_mode = GET_MODE (op0);
2400 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2401
2402 /* All of the unordered compare instructions only work on registers.
2403 The same is true of the fcomi compare instructions. The XFmode
2404 compare instructions require registers except when comparing
2405 against zero or when converting operand 1 from fixed point to
2406 floating point. */
2407
2408 if (!is_sse
2409 && (unordered_compare
2410 || (op_mode == XFmode
2411 && ! (standard_80387_constant_p (op0) == 1
2412 || standard_80387_constant_p (op1) == 1)
2413 && GET_CODE (op1) != FLOAT)
2414 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2415 {
2416 op0 = force_reg (op_mode, op0);
2417 op1 = force_reg (op_mode, op1);
2418 }
2419 else
2420 {
2421 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2422 things around if they appear profitable, otherwise force op0
2423 into a register. */
2424
2425 if (standard_80387_constant_p (op0) == 0
2426 || (MEM_P (op0)
2427 && ! (standard_80387_constant_p (op1) == 0
2428 || MEM_P (op1))))
2429 {
2430 enum rtx_code new_code = ix86_fp_swap_condition (code);
2431 if (new_code != UNKNOWN)
2432 {
2433 std::swap (op0, op1);
2434 code = new_code;
2435 }
2436 }
2437
2438 if (!REG_P (op0))
2439 op0 = force_reg (op_mode, op0);
2440
2441 if (CONSTANT_P (op1))
2442 {
2443 int tmp = standard_80387_constant_p (op1);
2444 if (tmp == 0)
2445 op1 = validize_mem (force_const_mem (op_mode, op1));
2446 else if (tmp == 1)
2447 {
2448 if (TARGET_CMOVE)
2449 op1 = force_reg (op_mode, op1);
2450 }
2451 else
2452 op1 = force_reg (op_mode, op1);
2453 }
2454 }
2455
2456 /* Try to rearrange the comparison to make it cheaper. */
2457 if (ix86_fp_comparison_cost (code)
2458 > ix86_fp_comparison_cost (swap_condition (code))
2459 && (REG_P (op1) || can_create_pseudo_p ()))
2460 {
2461 std::swap (op0, op1);
2462 code = swap_condition (code);
2463 if (!REG_P (op0))
2464 op0 = force_reg (op_mode, op0);
2465 }
2466
2467 *pop0 = op0;
2468 *pop1 = op1;
2469 return code;
2470 }
2471
2472 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2473
2474 static rtx
2475 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2476 {
2477 bool unordered_compare = ix86_unordered_fp_compare (code);
2478 machine_mode cmp_mode;
2479 rtx tmp, scratch;
2480
2481 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2482
2483 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2484 if (unordered_compare)
2485 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2486
2487 /* Do fcomi/sahf based test when profitable. */
2488 switch (ix86_fp_comparison_strategy (code))
2489 {
2490 case IX86_FPCMP_COMI:
2491 cmp_mode = CCFPmode;
2492 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2493 break;
2494
2495 case IX86_FPCMP_SAHF:
2496 cmp_mode = CCFPmode;
2497 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2498 scratch = gen_reg_rtx (HImode);
2499 emit_insn (gen_rtx_SET (scratch, tmp));
2500 emit_insn (gen_x86_sahf_1 (scratch));
2501 break;
2502
2503 case IX86_FPCMP_ARITH:
2504 cmp_mode = CCNOmode;
2505 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2506 scratch = gen_reg_rtx (HImode);
2507 emit_insn (gen_rtx_SET (scratch, tmp));
2508
2509 /* In the unordered case, we have to check C2 for NaN's, which
2510 doesn't happen to work out to anything nice combination-wise.
2511 So do some bit twiddling on the value we've got in AH to come
2512 up with an appropriate set of condition codes. */
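      /* For example (illustrative): the GT case is expected to reduce to
	 roughly
		fnstsw	%ax
		testb	$0x45, %ah
		je	label
	 where 0x45 masks the C0, C2 and C3 condition bits of the FPU
	 status word.  */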
2513
2514 switch (code)
2515 {
2516 case GT:
2517 case UNGT:
2518 if (code == GT || !TARGET_IEEE_FP)
2519 {
2520 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2521 code = EQ;
2522 }
2523 else
2524 {
2525 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2526 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2527 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2528 cmp_mode = CCmode;
2529 code = GEU;
2530 }
2531 break;
2532 case LT:
2533 case UNLT:
2534 if (code == LT && TARGET_IEEE_FP)
2535 {
2536 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2537 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2538 cmp_mode = CCmode;
2539 code = EQ;
2540 }
2541 else
2542 {
2543 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2544 code = NE;
2545 }
2546 break;
2547 case GE:
2548 case UNGE:
2549 if (code == GE || !TARGET_IEEE_FP)
2550 {
2551 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2552 code = EQ;
2553 }
2554 else
2555 {
2556 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2557 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2558 code = NE;
2559 }
2560 break;
2561 case LE:
2562 case UNLE:
2563 if (code == LE && TARGET_IEEE_FP)
2564 {
2565 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2566 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2567 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2568 cmp_mode = CCmode;
2569 code = LTU;
2570 }
2571 else
2572 {
2573 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2574 code = NE;
2575 }
2576 break;
2577 case EQ:
2578 case UNEQ:
2579 if (code == EQ && TARGET_IEEE_FP)
2580 {
2581 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2582 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2583 cmp_mode = CCmode;
2584 code = EQ;
2585 }
2586 else
2587 {
2588 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2589 code = NE;
2590 }
2591 break;
2592 case NE:
2593 case LTGT:
2594 if (code == NE && TARGET_IEEE_FP)
2595 {
2596 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2597 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2598 GEN_INT (0x40)));
2599 code = NE;
2600 }
2601 else
2602 {
2603 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2604 code = EQ;
2605 }
2606 break;
2607
2608 case UNORDERED:
2609 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2610 code = NE;
2611 break;
2612 case ORDERED:
2613 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2614 code = EQ;
2615 break;
2616
2617 default:
2618 gcc_unreachable ();
2619 }
2620 break;
2621
2622 default:
2623 gcc_unreachable();
2624 }
2625
2626 /* Return the test that should be put into the flags user, i.e.
2627 the bcc, scc, or cmov instruction. */
2628 return gen_rtx_fmt_ee (code, VOIDmode,
2629 gen_rtx_REG (cmp_mode, FLAGS_REG),
2630 const0_rtx);
2631 }
2632
2633 /* Generate insn patterns to do an integer compare of OPERANDS. */
2634
2635 static rtx
2636 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2637 {
2638 machine_mode cmpmode;
2639 rtx tmp, flags;
2640
2641 cmpmode = SELECT_CC_MODE (code, op0, op1);
2642 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2643
2644 /* This is very simple, but making the interface the same as in the
2645 FP case makes the rest of the code easier. */
2646 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2647 emit_insn (gen_rtx_SET (flags, tmp));
2648
2649 /* Return the test that should be put into the flags user, i.e.
2650 the bcc, scc, or cmov instruction. */
2651 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2652 }
2653
2654 static rtx
2655 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2656 {
2657 rtx ret;
2658
2659 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2660 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2661
2662 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2663 {
2664 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2665 ret = ix86_expand_fp_compare (code, op0, op1);
2666 }
2667 else
2668 ret = ix86_expand_int_compare (code, op0, op1);
2669
2670 return ret;
2671 }
2672
2673 void
2674 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2675 {
2676 rtx ret;
2677
2678 gcc_assert (GET_MODE (dest) == QImode);
2679
2680 ret = ix86_expand_compare (code, op0, op1);
2681 PUT_MODE (ret, QImode);
2682 emit_insn (gen_rtx_SET (dest, ret));
2683 }
2684
2685 /* Expand comparison setting or clearing carry flag. Return true when
2686 successful and set pop for the operation. */
2687 static bool
2688 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2689 {
2690 machine_mode mode
2691 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2692
2693 /* Do not handle double-mode compares that go through special path. */
2694 if (mode == (TARGET_64BIT ? TImode : DImode))
2695 return false;
2696
2697 if (SCALAR_FLOAT_MODE_P (mode))
2698 {
2699 rtx compare_op;
2700 rtx_insn *compare_seq;
2701
2702 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2703
2704 /* Shortcut: following common codes never translate
2705 into carry flag compares. */
2706 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2707 || code == ORDERED || code == UNORDERED)
2708 return false;
2709
2710 /* These comparisons require zero flag; swap operands so they won't. */
2711 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2712 && !TARGET_IEEE_FP)
2713 {
2714 std::swap (op0, op1);
2715 code = swap_condition (code);
2716 }
2717
2718       /* Try to expand the comparison and verify that we end up with
2719 	 a carry flag based comparison.  This fails only when we decide
2720 	 to expand the comparison using arithmetic, which is not a
2721 	 common scenario.  */
2722 start_sequence ();
2723 compare_op = ix86_expand_fp_compare (code, op0, op1);
2724 compare_seq = get_insns ();
2725 end_sequence ();
2726
2727 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2728 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2729 else
2730 code = GET_CODE (compare_op);
2731
2732 if (code != LTU && code != GEU)
2733 return false;
2734
2735 emit_insn (compare_seq);
2736 *pop = compare_op;
2737 return true;
2738 }
2739
2740 if (!INTEGRAL_MODE_P (mode))
2741 return false;
2742
2743 switch (code)
2744 {
2745 case LTU:
2746 case GEU:
2747 break;
2748
2749 /* Convert a==0 into (unsigned)a<1. */
2750 case EQ:
2751 case NE:
2752 if (op1 != const0_rtx)
2753 return false;
2754 op1 = const1_rtx;
2755 code = (code == EQ ? LTU : GEU);
2756 break;
2757
2758     /* Convert a>b into b<a or a>=b+1.  */
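    /* E.g. the unsigned test "a > 5" becomes "a >= 6", which a single
       cmp can answer through the carry flag alone.  */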
2759 case GTU:
2760 case LEU:
2761 if (CONST_INT_P (op1))
2762 {
2763 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2764 /* Bail out on overflow. We still can swap operands but that
2765 would force loading of the constant into register. */
2766 if (op1 == const0_rtx
2767 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2768 return false;
2769 code = (code == GTU ? GEU : LTU);
2770 }
2771 else
2772 {
2773 std::swap (op0, op1);
2774 code = (code == GTU ? LTU : GEU);
2775 }
2776 break;
2777
2778 /* Convert a>=0 into (unsigned)a<0x80000000. */
2779 case LT:
2780 case GE:
2781 if (mode == DImode || op1 != const0_rtx)
2782 return false;
2783 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2784 code = (code == LT ? GEU : LTU);
2785 break;
2786 case LE:
2787 case GT:
2788 if (mode == DImode || op1 != constm1_rtx)
2789 return false;
2790 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2791 code = (code == LE ? GEU : LTU);
2792 break;
2793
2794 default:
2795 return false;
2796 }
2797   /* Swapping operands may cause a constant to appear as the first operand.  */
2798 if (!nonimmediate_operand (op0, VOIDmode))
2799 {
2800 if (!can_create_pseudo_p ())
2801 return false;
2802 op0 = force_reg (mode, op0);
2803 }
2804 *pop = ix86_expand_compare (code, op0, op1);
2805 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2806 return true;
2807 }
2808
2809 /* Expand conditional increment or decrement using adc/sbb instructions.
2810 The default case using setcc followed by the conditional move can be
2811 done by generic code. */
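/* A typical case (an illustrative sketch): for unsigned operands,
   "c += (a < b)" is expected to expand through this path to roughly
	cmpl	b, a
	adcl	$0, c
   reusing the carry produced by the compare directly.  */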
2812 bool
2813 ix86_expand_int_addcc (rtx operands[])
2814 {
2815 enum rtx_code code = GET_CODE (operands[1]);
2816 rtx flags;
2817 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2818 rtx compare_op;
2819 rtx val = const0_rtx;
2820 bool fpcmp = false;
2821 machine_mode mode;
2822 rtx op0 = XEXP (operands[1], 0);
2823 rtx op1 = XEXP (operands[1], 1);
2824
2825 if (operands[3] != const1_rtx
2826 && operands[3] != constm1_rtx)
2827 return false;
2828 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2829 return false;
2830 code = GET_CODE (compare_op);
2831
2832 flags = XEXP (compare_op, 0);
2833
2834 if (GET_MODE (flags) == CCFPmode)
2835 {
2836 fpcmp = true;
2837 code = ix86_fp_compare_code_to_integer (code);
2838 }
2839
2840 if (code != LTU)
2841 {
2842 val = constm1_rtx;
2843 if (fpcmp)
2844 PUT_CODE (compare_op,
2845 reverse_condition_maybe_unordered
2846 (GET_CODE (compare_op)));
2847 else
2848 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2849 }
2850
2851 mode = GET_MODE (operands[0]);
2852
2853 /* Construct either adc or sbb insn. */
2854 if ((code == LTU) == (operands[3] == constm1_rtx))
2855 insn = gen_sub3_carry;
2856 else
2857 insn = gen_add3_carry;
2858
2859 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2860
2861 return true;
2862 }
2863
2864 bool
2865 ix86_expand_int_movcc (rtx operands[])
2866 {
2867 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2868 rtx_insn *compare_seq;
2869 rtx compare_op;
2870 machine_mode mode = GET_MODE (operands[0]);
2871 bool sign_bit_compare_p = false;
2872 rtx op0 = XEXP (operands[1], 0);
2873 rtx op1 = XEXP (operands[1], 1);
2874
2875 if (GET_MODE (op0) == TImode
2876 || (GET_MODE (op0) == DImode
2877 && !TARGET_64BIT))
2878 return false;
2879
2880 start_sequence ();
2881 compare_op = ix86_expand_compare (code, op0, op1);
2882 compare_seq = get_insns ();
2883 end_sequence ();
2884
2885 compare_code = GET_CODE (compare_op);
2886
2887 if ((op1 == const0_rtx && (code == GE || code == LT))
2888 || (op1 == constm1_rtx && (code == GT || code == LE)))
2889 sign_bit_compare_p = true;
2890
2891 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2892 HImode insns, we'd be swallowed in word prefix ops. */
2893
2894 if ((mode != HImode || TARGET_FAST_PREFIX)
2895 && (mode != (TARGET_64BIT ? TImode : DImode))
2896 && CONST_INT_P (operands[2])
2897 && CONST_INT_P (operands[3]))
2898 {
2899 rtx out = operands[0];
2900 HOST_WIDE_INT ct = INTVAL (operands[2]);
2901 HOST_WIDE_INT cf = INTVAL (operands[3]);
2902 HOST_WIDE_INT diff;
2903
2904 diff = ct - cf;
2905 	  /* Sign bit compares are better done using shifts than using
2906 	     sbb.  */
2907 if (sign_bit_compare_p
2908 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2909 {
2910 /* Detect overlap between destination and compare sources. */
2911 rtx tmp = out;
2912
2913 if (!sign_bit_compare_p)
2914 {
2915 rtx flags;
2916 bool fpcmp = false;
2917
2918 compare_code = GET_CODE (compare_op);
2919
2920 flags = XEXP (compare_op, 0);
2921
2922 if (GET_MODE (flags) == CCFPmode)
2923 {
2924 fpcmp = true;
2925 compare_code
2926 = ix86_fp_compare_code_to_integer (compare_code);
2927 }
2928
2929 /* To simplify rest of code, restrict to the GEU case. */
2930 if (compare_code == LTU)
2931 {
2932 std::swap (ct, cf);
2933 compare_code = reverse_condition (compare_code);
2934 code = reverse_condition (code);
2935 }
2936 else
2937 {
2938 if (fpcmp)
2939 PUT_CODE (compare_op,
2940 reverse_condition_maybe_unordered
2941 (GET_CODE (compare_op)));
2942 else
2943 PUT_CODE (compare_op,
2944 reverse_condition (GET_CODE (compare_op)));
2945 }
2946 diff = ct - cf;
2947
2948 if (reg_overlap_mentioned_p (out, op0)
2949 || reg_overlap_mentioned_p (out, op1))
2950 tmp = gen_reg_rtx (mode);
2951
2952 if (mode == DImode)
2953 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2954 else
2955 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2956 flags, compare_op));
2957 }
2958 else
2959 {
2960 if (code == GT || code == GE)
2961 code = reverse_condition (code);
2962 else
2963 {
2964 std::swap (ct, cf);
2965 diff = ct - cf;
2966 }
2967 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2968 }
2969
2970 if (diff == 1)
2971 {
2972 /*
2973 * cmpl op0,op1
2974 * sbbl dest,dest
2975 * [addl dest, ct]
2976 *
2977 * Size 5 - 8.
2978 */
2979 if (ct)
2980 tmp = expand_simple_binop (mode, PLUS,
2981 tmp, GEN_INT (ct),
2982 copy_rtx (tmp), 1, OPTAB_DIRECT);
2983 }
2984 else if (cf == -1)
2985 {
2986 /*
2987 * cmpl op0,op1
2988 * sbbl dest,dest
2989 * orl $ct, dest
2990 *
2991 * Size 8.
2992 */
2993 tmp = expand_simple_binop (mode, IOR,
2994 tmp, GEN_INT (ct),
2995 copy_rtx (tmp), 1, OPTAB_DIRECT);
2996 }
2997 else if (diff == -1 && ct)
2998 {
2999 /*
3000 * cmpl op0,op1
3001 * sbbl dest,dest
3002 * notl dest
3003 * [addl dest, cf]
3004 *
3005 * Size 8 - 11.
3006 */
3007 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3008 if (cf)
3009 tmp = expand_simple_binop (mode, PLUS,
3010 copy_rtx (tmp), GEN_INT (cf),
3011 copy_rtx (tmp), 1, OPTAB_DIRECT);
3012 }
3013 else
3014 {
3015 /*
3016 * cmpl op0,op1
3017 * sbbl dest,dest
3018 * [notl dest]
3019 * andl cf - ct, dest
3020 * [addl dest, ct]
3021 *
3022 * Size 8 - 11.
3023 */
3024
3025 if (cf == 0)
3026 {
3027 cf = ct;
3028 ct = 0;
3029 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3030 }
3031
3032 tmp = expand_simple_binop (mode, AND,
3033 copy_rtx (tmp),
3034 gen_int_mode (cf - ct, mode),
3035 copy_rtx (tmp), 1, OPTAB_DIRECT);
3036 if (ct)
3037 tmp = expand_simple_binop (mode, PLUS,
3038 copy_rtx (tmp), GEN_INT (ct),
3039 copy_rtx (tmp), 1, OPTAB_DIRECT);
3040 }
3041
3042 if (!rtx_equal_p (tmp, out))
3043 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3044
3045 return true;
3046 }
3047
3048 if (diff < 0)
3049 {
3050 machine_mode cmp_mode = GET_MODE (op0);
3051 enum rtx_code new_code;
3052
3053 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3054 {
3055 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3056
3057 /* We may be reversing a non-trapping
3058 comparison to a trapping comparison. */
3059 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3060 && code != EQ && code != NE
3061 && code != ORDERED && code != UNORDERED)
3062 new_code = UNKNOWN;
3063 else
3064 new_code = reverse_condition_maybe_unordered (code);
3065 }
3066 else
3067 new_code = ix86_reverse_condition (code, cmp_mode);
3068 if (new_code != UNKNOWN)
3069 {
3070 std::swap (ct, cf);
3071 diff = -diff;
3072 code = new_code;
3073 }
3074 }
3075
3076 compare_code = UNKNOWN;
3077 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3078 && CONST_INT_P (op1))
3079 {
3080 if (op1 == const0_rtx
3081 && (code == LT || code == GE))
3082 compare_code = code;
3083 else if (op1 == constm1_rtx)
3084 {
3085 if (code == LE)
3086 compare_code = LT;
3087 else if (code == GT)
3088 compare_code = GE;
3089 }
3090 }
3091
3092 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3093 if (compare_code != UNKNOWN
3094 && GET_MODE (op0) == GET_MODE (out)
3095 && (cf == -1 || ct == -1))
3096 {
3097 /* If lea code below could be used, only optimize
3098 if it results in a 2 insn sequence. */
3099
3100 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3101 || diff == 3 || diff == 5 || diff == 9)
3102 || (compare_code == LT && ct == -1)
3103 || (compare_code == GE && cf == -1))
3104 {
3105 /*
3106 * notl op1 (if necessary)
3107 * sarl $31, op1
3108 * orl cf, op1
3109 */
3110 if (ct != -1)
3111 {
3112 cf = ct;
3113 ct = -1;
3114 code = reverse_condition (code);
3115 }
3116
3117 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3118
3119 out = expand_simple_binop (mode, IOR,
3120 out, GEN_INT (cf),
3121 out, 1, OPTAB_DIRECT);
3122 if (out != operands[0])
3123 emit_move_insn (operands[0], out);
3124
3125 return true;
3126 }
3127 }
3128
3129
3130 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3131 || diff == 3 || diff == 5 || diff == 9)
3132 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3133 && (mode != DImode
3134 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3135 {
3136 /*
3137 * xorl dest,dest
3138 * cmpl op1,op2
3139 * setcc dest
3140 * lea cf(dest*(ct-cf)),dest
3141 *
3142 * Size 14.
3143 *
3144 * This also catches the degenerate setcc-only case.
3145 */
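	  /* A concrete instance (illustrative): with ct == 7 and cf == 3,
	     diff is 4 and the value can be formed as
		xorl	%eax, %eax
		cmpl	op1, op2
		sete	%al
		leal	3(,%eax,4), %eax
	     i.e. setcc yields 0/1 and the lea scales and offsets it.  */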
3146
3147 rtx tmp;
3148 int nops;
3149
3150 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3151
3152 nops = 0;
3153       /* On x86_64 the lea instruction operates on Pmode, so we need
3154 	 to get the arithmetic done in the proper mode to match.  */
3155 if (diff == 1)
3156 tmp = copy_rtx (out);
3157 else
3158 {
3159 rtx out1;
3160 out1 = copy_rtx (out);
3161 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3162 nops++;
3163 if (diff & 1)
3164 {
3165 tmp = gen_rtx_PLUS (mode, tmp, out1);
3166 nops++;
3167 }
3168 }
3169 if (cf != 0)
3170 {
3171 tmp = plus_constant (mode, tmp, cf);
3172 nops++;
3173 }
3174 if (!rtx_equal_p (tmp, out))
3175 {
3176 if (nops == 1)
3177 out = force_operand (tmp, copy_rtx (out));
3178 else
3179 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3180 }
3181 if (!rtx_equal_p (out, operands[0]))
3182 emit_move_insn (operands[0], copy_rtx (out));
3183
3184 return true;
3185 }
3186
3187 /*
3188 * General case: Jumpful:
3189 * xorl dest,dest cmpl op1, op2
3190 * cmpl op1, op2 movl ct, dest
3191 * setcc dest jcc 1f
3192 * decl dest movl cf, dest
3193 * andl (cf-ct),dest 1:
3194 * addl ct,dest
3195 *
3196 * Size 20. Size 14.
3197 *
3198 * This is reasonably steep, but branch mispredict costs are
3199 * high on modern cpus, so consider failing only if optimizing
3200 * for space.
3201 */
3202
3203 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3204 && BRANCH_COST (optimize_insn_for_speed_p (),
3205 false) >= 2)
3206 {
3207 if (cf == 0)
3208 {
3209 machine_mode cmp_mode = GET_MODE (op0);
3210 enum rtx_code new_code;
3211
3212 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3213 {
3214 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3215
3216 /* We may be reversing a non-trapping
3217 comparison to a trapping comparison. */
3218 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3219 && code != EQ && code != NE
3220 && code != ORDERED && code != UNORDERED)
3221 new_code = UNKNOWN;
3222 else
3223 new_code = reverse_condition_maybe_unordered (code);
3224
3225 }
3226 else
3227 {
3228 new_code = ix86_reverse_condition (code, cmp_mode);
3229 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3230 compare_code = reverse_condition (compare_code);
3231 }
3232
3233 if (new_code != UNKNOWN)
3234 {
3235 cf = ct;
3236 ct = 0;
3237 code = new_code;
3238 }
3239 }
3240
3241 if (compare_code != UNKNOWN)
3242 {
3243 /* notl op1 (if needed)
3244 sarl $31, op1
3245 andl (cf-ct), op1
3246 addl ct, op1
3247
3248 For x < 0 (resp. x <= -1) there will be no notl,
3249 so if possible swap the constants to get rid of the
3250 complement.
3251 True/false will be -1/0 while code below (store flag
3252 followed by decrement) is 0/-1, so the constants need
3253 to be exchanged once more. */
3254
3255 if (compare_code == GE || !cf)
3256 {
3257 code = reverse_condition (code);
3258 compare_code = LT;
3259 }
3260 else
3261 std::swap (ct, cf);
3262
3263 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3264 }
3265 else
3266 {
3267 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3268
3269 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3270 constm1_rtx,
3271 copy_rtx (out), 1, OPTAB_DIRECT);
3272 }
3273
3274 out = expand_simple_binop (mode, AND, copy_rtx (out),
3275 gen_int_mode (cf - ct, mode),
3276 copy_rtx (out), 1, OPTAB_DIRECT);
3277 if (ct)
3278 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3279 copy_rtx (out), 1, OPTAB_DIRECT);
3280 if (!rtx_equal_p (out, operands[0]))
3281 emit_move_insn (operands[0], copy_rtx (out));
3282
3283 return true;
3284 }
3285 }
3286
3287 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3288 {
3289 /* Try a few things more with specific constants and a variable. */
3290
3291 optab op;
3292 rtx var, orig_out, out, tmp;
3293
3294 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3295 return false;
3296
3297 /* If one of the two operands is an interesting constant, load a
3298 constant with the above and mask it in with a logical operation. */
3299
3300 if (CONST_INT_P (operands[2]))
3301 {
3302 var = operands[3];
3303 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3304 operands[3] = constm1_rtx, op = and_optab;
3305 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3306 operands[3] = const0_rtx, op = ior_optab;
3307 else
3308 return false;
3309 }
3310 else if (CONST_INT_P (operands[3]))
3311 {
3312 var = operands[2];
3313 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3314 {
3315 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3316 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3317 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3318 operands[1] = simplify_gen_relational (LT, VOIDmode,
3319 GET_MODE (op0),
3320 op0, const0_rtx);
3321
3322 operands[2] = constm1_rtx;
3323 op = and_optab;
3324 }
3325 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3326 operands[2] = const0_rtx, op = ior_optab;
3327 else
3328 return false;
3329 }
3330 else
3331 return false;
3332
3333 orig_out = operands[0];
3334 tmp = gen_reg_rtx (mode);
3335 operands[0] = tmp;
3336
3337 /* Recurse to get the constant loaded. */
3338 if (!ix86_expand_int_movcc (operands))
3339 return false;
3340
3341 /* Mask in the interesting variable. */
3342 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3343 OPTAB_WIDEN);
3344 if (!rtx_equal_p (out, orig_out))
3345 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3346
3347 return true;
3348 }
3349
3350 /*
3351 * For comparison with above,
3352 *
3353 * movl cf,dest
3354 * movl ct,tmp
3355 * cmpl op1,op2
3356 * cmovcc tmp,dest
3357 *
3358 * Size 15.
3359 */
3360
3361 if (! nonimmediate_operand (operands[2], mode))
3362 operands[2] = force_reg (mode, operands[2]);
3363 if (! nonimmediate_operand (operands[3], mode))
3364 operands[3] = force_reg (mode, operands[3]);
3365
3366 if (! register_operand (operands[2], VOIDmode)
3367 && (mode == QImode
3368 || ! register_operand (operands[3], VOIDmode)))
3369 operands[2] = force_reg (mode, operands[2]);
3370
3371 if (mode == QImode
3372 && ! register_operand (operands[3], VOIDmode))
3373 operands[3] = force_reg (mode, operands[3]);
3374
3375 emit_insn (compare_seq);
3376 emit_insn (gen_rtx_SET (operands[0],
3377 gen_rtx_IF_THEN_ELSE (mode,
3378 compare_op, operands[2],
3379 operands[3])));
3380 return true;
3381 }
3382
3383 /* Detect conditional moves that exactly match min/max operational
3384 semantics. Note that this is IEEE safe, as long as we don't
3385 interchange the operands.
3386
3387 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3388 and TRUE if the operation is successful and instructions are emitted. */
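/* For example, "x < y ? x : y" maps onto the SSE min instructions
   (minss/minps and friends) here.  The operand order matters because
   those instructions return their second source operand when the
   comparison is unordered (or when comparing zeros of opposite sign),
   which is why the mapping is only IEEE safe without swapping.  */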
3389
3390 static bool
3391 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3392 rtx cmp_op1, rtx if_true, rtx if_false)
3393 {
3394 machine_mode mode;
3395 bool is_min;
3396 rtx tmp;
3397
3398 if (code == LT)
3399 ;
3400 else if (code == UNGE)
3401 std::swap (if_true, if_false);
3402 else
3403 return false;
3404
3405 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3406 is_min = true;
3407 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3408 is_min = false;
3409 else
3410 return false;
3411
3412 mode = GET_MODE (dest);
3413
3414 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3415 but MODE may be a vector mode and thus not appropriate. */
3416 if (!flag_finite_math_only || flag_signed_zeros)
3417 {
3418 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3419 rtvec v;
3420
3421 if_true = force_reg (mode, if_true);
3422 v = gen_rtvec (2, if_true, if_false);
3423 tmp = gen_rtx_UNSPEC (mode, v, u);
3424 }
3425 else
3426 {
3427 code = is_min ? SMIN : SMAX;
3428 if (MEM_P (if_true) && MEM_P (if_false))
3429 if_true = force_reg (mode, if_true);
3430 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3431 }
3432
3433 emit_insn (gen_rtx_SET (dest, tmp));
3434 return true;
3435 }
3436
3437 /* Return true if MODE is valid for vector compare to mask register;
3438    the same holds for a conditional vector move with a mask register.  */
3439 static bool
3440 ix86_valid_mask_cmp_mode (machine_mode mode)
3441 {
3442 /* XOP has its own vector conditional movement. */
3443 if (TARGET_XOP && !TARGET_AVX512F)
3444 return false;
3445
3446 /* AVX512F is needed for mask operation. */
3447 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3448 return false;
3449
3450 /* AVX512BW is needed for vector QI/HImode,
3451 AVX512VL is needed for 128/256-bit vector. */
3452 machine_mode inner_mode = GET_MODE_INNER (mode);
3453 int vector_size = GET_MODE_SIZE (mode);
3454 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3455 return false;
3456
3457 return vector_size == 64 || TARGET_AVX512VL;
3458 }
3459
3460 /* Expand an SSE comparison. Return the register with the result. */
3461
3462 static rtx
3463 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3464 rtx op_true, rtx op_false)
3465 {
3466 machine_mode mode = GET_MODE (dest);
3467 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3468
3469   /* In the general case the result of comparison can differ from the operands' type.  */
3470 machine_mode cmp_mode;
3471
3472 /* In AVX512F the result of comparison is an integer mask. */
3473 bool maskcmp = false;
3474 rtx x;
3475
3476 if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3477 {
3478 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3479 maskcmp = true;
3480 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3481 }
3482 else
3483 cmp_mode = cmp_ops_mode;
3484
3485 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3486
3487 int (*op1_predicate)(rtx, machine_mode)
3488 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3489
3490 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3491 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3492
3493 if (optimize
3494 || (maskcmp && cmp_mode != mode)
3495 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3496 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3497 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3498
3499 if (maskcmp)
3500 {
3501 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3502 gcc_assert (ok);
3503 return dest;
3504 }
3505
3506 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3507
3508 if (cmp_mode != mode && !maskcmp)
3509 {
3510 x = force_reg (cmp_ops_mode, x);
3511 convert_move (dest, x, false);
3512 }
3513 else
3514 emit_insn (gen_rtx_SET (dest, x));
3515
3516 return dest;
3517 }
3518
3519 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3520 operations. This is used for both scalar and vector conditional moves. */
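/* When no blend instruction applies, the fallback at the end of this
   function computes, in essence,
	dest = (cmp & op_true) | (~cmp & op_false)
   which is why CMP is expected to be an all-ones/all-zeros per-element
   mask on the non-AVX512 paths.  */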
3521
3522 void
3523 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3524 {
3525 machine_mode mode = GET_MODE (dest);
3526 machine_mode cmpmode = GET_MODE (cmp);
3527
3528 /* In AVX512F the result of comparison is an integer mask. */
3529 bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3530
3531 rtx t2, t3, x;
3532
3533   /* If we have an integer mask and an FP value then we need
3534      to cast the mask to the FP mode.  */
3535 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3536 {
3537 cmp = force_reg (cmpmode, cmp);
3538 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3539 }
3540
3541 if (maskcmp)
3542 {
3543 /* Using vector move with mask register. */
3544 cmp = force_reg (cmpmode, cmp);
3545 /* Optimize for mask zero. */
3546 op_true = (op_true != CONST0_RTX (mode)
3547 ? force_reg (mode, op_true) : op_true);
3548 op_false = (op_false != CONST0_RTX (mode)
3549 ? force_reg (mode, op_false) : op_false);
3550 if (op_true == CONST0_RTX (mode))
3551 {
3552 rtx (*gen_not) (rtx, rtx);
3553 switch (cmpmode)
3554 {
3555 case E_QImode: gen_not = gen_knotqi; break;
3556 case E_HImode: gen_not = gen_knothi; break;
3557 case E_SImode: gen_not = gen_knotsi; break;
3558 case E_DImode: gen_not = gen_knotdi; break;
3559 default: gcc_unreachable ();
3560 }
3561 rtx n = gen_reg_rtx (cmpmode);
3562 emit_insn (gen_not (n, cmp));
3563 cmp = n;
3564 	  /* Reverse op_true and op_false.  */
3565 std::swap (op_true, op_false);
3566 }
3567
3568 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3569 emit_insn (gen_rtx_SET (dest, vec_merge));
3570 return;
3571 }
3572 else if (vector_all_ones_operand (op_true, mode)
3573 && op_false == CONST0_RTX (mode))
3574 {
3575 emit_insn (gen_rtx_SET (dest, cmp));
3576 return;
3577 }
3578 else if (op_false == CONST0_RTX (mode))
3579 {
3580 op_true = force_reg (mode, op_true);
3581 x = gen_rtx_AND (mode, cmp, op_true);
3582 emit_insn (gen_rtx_SET (dest, x));
3583 return;
3584 }
3585 else if (op_true == CONST0_RTX (mode))
3586 {
3587 op_false = force_reg (mode, op_false);
3588 x = gen_rtx_NOT (mode, cmp);
3589 x = gen_rtx_AND (mode, x, op_false);
3590 emit_insn (gen_rtx_SET (dest, x));
3591 return;
3592 }
3593 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3594 {
3595 op_false = force_reg (mode, op_false);
3596 x = gen_rtx_IOR (mode, cmp, op_false);
3597 emit_insn (gen_rtx_SET (dest, x));
3598 return;
3599 }
3600 else if (TARGET_XOP)
3601 {
3602 op_true = force_reg (mode, op_true);
3603
3604 if (!nonimmediate_operand (op_false, mode))
3605 op_false = force_reg (mode, op_false);
3606
3607 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3608 op_true,
3609 op_false)));
3610 return;
3611 }
3612
3613 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3614 rtx d = dest;
3615
3616 if (!vector_operand (op_true, mode))
3617 op_true = force_reg (mode, op_true);
3618
3619 op_false = force_reg (mode, op_false);
3620
3621 switch (mode)
3622 {
3623 case E_V4SFmode:
3624 if (TARGET_SSE4_1)
3625 gen = gen_sse4_1_blendvps;
3626 break;
3627 case E_V2DFmode:
3628 if (TARGET_SSE4_1)
3629 gen = gen_sse4_1_blendvpd;
3630 break;
3631 case E_SFmode:
3632 if (TARGET_SSE4_1)
3633 {
3634 gen = gen_sse4_1_blendvss;
3635 op_true = force_reg (mode, op_true);
3636 }
3637 break;
3638 case E_DFmode:
3639 if (TARGET_SSE4_1)
3640 {
3641 gen = gen_sse4_1_blendvsd;
3642 op_true = force_reg (mode, op_true);
3643 }
3644 break;
3645 case E_V16QImode:
3646 case E_V8HImode:
3647 case E_V4SImode:
3648 case E_V2DImode:
3649 if (TARGET_SSE4_1)
3650 {
3651 gen = gen_sse4_1_pblendvb;
3652 if (mode != V16QImode)
3653 d = gen_reg_rtx (V16QImode);
3654 op_false = gen_lowpart (V16QImode, op_false);
3655 op_true = gen_lowpart (V16QImode, op_true);
3656 cmp = gen_lowpart (V16QImode, cmp);
3657 }
3658 break;
3659 case E_V8SFmode:
3660 if (TARGET_AVX)
3661 gen = gen_avx_blendvps256;
3662 break;
3663 case E_V4DFmode:
3664 if (TARGET_AVX)
3665 gen = gen_avx_blendvpd256;
3666 break;
3667 case E_V32QImode:
3668 case E_V16HImode:
3669 case E_V8SImode:
3670 case E_V4DImode:
3671 if (TARGET_AVX2)
3672 {
3673 gen = gen_avx2_pblendvb;
3674 if (mode != V32QImode)
3675 d = gen_reg_rtx (V32QImode);
3676 op_false = gen_lowpart (V32QImode, op_false);
3677 op_true = gen_lowpart (V32QImode, op_true);
3678 cmp = gen_lowpart (V32QImode, cmp);
3679 }
3680 break;
3681
3682 case E_V64QImode:
3683 gen = gen_avx512bw_blendmv64qi;
3684 break;
3685 case E_V32HImode:
3686 gen = gen_avx512bw_blendmv32hi;
3687 break;
3688 case E_V16SImode:
3689 gen = gen_avx512f_blendmv16si;
3690 break;
3691 case E_V8DImode:
3692 gen = gen_avx512f_blendmv8di;
3693 break;
3694 case E_V8DFmode:
3695 gen = gen_avx512f_blendmv8df;
3696 break;
3697 case E_V16SFmode:
3698 gen = gen_avx512f_blendmv16sf;
3699 break;
3700
3701 default:
3702 break;
3703 }
3704
3705 if (gen != NULL)
3706 {
3707 emit_insn (gen (d, op_false, op_true, cmp));
3708 if (d != dest)
3709 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3710 }
3711 else
3712 {
3713 op_true = force_reg (mode, op_true);
3714
3715 t2 = gen_reg_rtx (mode);
3716 if (optimize)
3717 t3 = gen_reg_rtx (mode);
3718 else
3719 t3 = dest;
3720
3721 x = gen_rtx_AND (mode, op_true, cmp);
3722 emit_insn (gen_rtx_SET (t2, x));
3723
3724 x = gen_rtx_NOT (mode, cmp);
3725 x = gen_rtx_AND (mode, x, op_false);
3726 emit_insn (gen_rtx_SET (t3, x));
3727
3728 x = gen_rtx_IOR (mode, t3, t2);
3729 emit_insn (gen_rtx_SET (dest, x));
3730 }
3731 }
3732
3733 /* Swap, force into registers, or otherwise massage the two operands
3734 to an sse comparison with a mask result. Thus we differ a bit from
3735 ix86_prepare_fp_compare_args which expects to produce a flags result.
3736
3737 The DEST operand exists to help determine whether to commute commutative
3738 operators. The POP0/POP1 operands are updated in place. The new
3739 comparison code is returned, or UNKNOWN if not implementable. */
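/* For instance, without AVX a scalar "x > y" is rewritten below as
   "y < x" so that it can use the LT form of cmpss/cmpps, since the
   pre-AVX compare encodings only cover EQ/LT/LE/UNORD and their
   negations.  */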
3740
3741 static enum rtx_code
3742 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3743 rtx *pop0, rtx *pop1)
3744 {
3745 switch (code)
3746 {
3747 case LTGT:
3748 case UNEQ:
3749 /* AVX supports all the needed comparisons. */
3750 if (TARGET_AVX)
3751 break;
3752 /* We have no LTGT as an operator. We could implement it with
3753 NE & ORDERED, but this requires an extra temporary. It's
3754 not clear that it's worth it. */
3755 return UNKNOWN;
3756
3757 case LT:
3758 case LE:
3759 case UNGT:
3760 case UNGE:
3761 /* These are supported directly. */
3762 break;
3763
3764 case EQ:
3765 case NE:
3766 case UNORDERED:
3767 case ORDERED:
3768 /* AVX has 3 operand comparisons, no need to swap anything. */
3769 if (TARGET_AVX)
3770 break;
3771 /* For commutative operators, try to canonicalize the destination
3772 operand to be first in the comparison - this helps reload to
3773 avoid extra moves. */
3774 if (!dest || !rtx_equal_p (dest, *pop1))
3775 break;
3776 /* FALLTHRU */
3777
3778 case GE:
3779 case GT:
3780 case UNLE:
3781 case UNLT:
3782 /* These are not supported directly before AVX, and furthermore
3783 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3784 comparison operands to transform into something that is
3785 supported. */
3786 std::swap (*pop0, *pop1);
3787 code = swap_condition (code);
3788 break;
3789
3790 default:
3791 gcc_unreachable ();
3792 }
3793
3794 return code;
3795 }
3796
3797 /* Expand a floating-point conditional move. Return true if successful. */
3798
3799 bool
3800 ix86_expand_fp_movcc (rtx operands[])
3801 {
3802 machine_mode mode = GET_MODE (operands[0]);
3803 enum rtx_code code = GET_CODE (operands[1]);
3804 rtx tmp, compare_op;
3805 rtx op0 = XEXP (operands[1], 0);
3806 rtx op1 = XEXP (operands[1], 1);
3807
3808 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3809 {
3810 machine_mode cmode;
3811
3812 /* Since we've no cmove for sse registers, don't force bad register
3813 allocation just to gain access to it. Deny movcc when the
3814 comparison mode doesn't match the move mode. */
3815 cmode = GET_MODE (op0);
3816 if (cmode == VOIDmode)
3817 cmode = GET_MODE (op1);
3818 if (cmode != mode)
3819 return false;
3820
3821 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3822 if (code == UNKNOWN)
3823 return false;
3824
3825 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3826 operands[2], operands[3]))
3827 return true;
3828
3829 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3830 operands[2], operands[3]);
3831 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3832 return true;
3833 }
3834
3835 if (GET_MODE (op0) == TImode
3836 || (GET_MODE (op0) == DImode
3837 && !TARGET_64BIT))
3838 return false;
3839
3840 /* The floating point conditional move instructions don't directly
3841 support conditions resulting from a signed integer comparison. */
3842
3843 compare_op = ix86_expand_compare (code, op0, op1);
3844 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3845 {
3846 tmp = gen_reg_rtx (QImode);
3847 ix86_expand_setcc (tmp, code, op0, op1);
3848
3849 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3850 }
3851
3852 emit_insn (gen_rtx_SET (operands[0],
3853 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3854 operands[2], operands[3])));
3855
3856 return true;
3857 }
3858
3859 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3860
3861 static int
3862 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3863 {
3864 switch (code)
3865 {
3866 case EQ:
3867 return 0;
3868 case LT:
3869 case LTU:
3870 return 1;
3871 case LE:
3872 case LEU:
3873 return 2;
3874 case NE:
3875 return 4;
3876 case GE:
3877 case GEU:
3878 return 5;
3879 case GT:
3880 case GTU:
3881 return 6;
3882 default:
3883 gcc_unreachable ();
3884 }
3885 }
3886
3887 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3888
3889 static int
3890 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3891 {
3892 switch (code)
3893 {
3894 case EQ:
3895 return 0x00;
3896 case NE:
3897 return 0x04;
3898 case GT:
3899 return 0x0e;
3900 case LE:
3901 return 0x02;
3902 case GE:
3903 return 0x0d;
3904 case LT:
3905 return 0x01;
3906 case UNLE:
3907 return 0x0a;
3908 case UNLT:
3909 return 0x09;
3910 case UNGE:
3911 return 0x05;
3912 case UNGT:
3913 return 0x06;
3914 case UNEQ:
3915 return 0x18;
3916 case LTGT:
3917 return 0x0c;
3918 case ORDERED:
3919 return 0x07;
3920 case UNORDERED:
3921 return 0x03;
3922 default:
3923 gcc_unreachable ();
3924 }
3925 }
3926
3927 /* Return immediate value to be used in UNSPEC_PCMP
3928 for comparison CODE in MODE. */
3929
3930 static int
3931 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3932 {
3933 if (FLOAT_MODE_P (mode))
3934 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3935 return ix86_int_cmp_code_to_pcmp_immediate (code);
3936 }
3937
3938 /* Expand AVX-512 vector comparison. */
3939
3940 bool
3941 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3942 {
3943 machine_mode mask_mode = GET_MODE (dest);
3944 machine_mode cmp_mode = GET_MODE (cmp_op0);
3945 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3946 int unspec_code;
3947 rtx unspec;
3948
3949 switch (code)
3950 {
3951 case LEU:
3952 case GTU:
3953 case GEU:
3954 case LTU:
3955 unspec_code = UNSPEC_UNSIGNED_PCMP;
3956 break;
3957
3958 default:
3959 unspec_code = UNSPEC_PCMP;
3960 }
3961
3962 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
3963 unspec_code);
3964 emit_insn (gen_rtx_SET (dest, unspec));
3965
3966 return true;
3967 }
3968
3969 /* Expand fp vector comparison. */
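/* Comparisons that have no direct encoding are built from two compares,
   e.g. LTGT as "ORDERED (x, y) AND NE (x, y)" and UNEQ as
   "UNORDERED (x, y) OR EQ (x, y)"; see the fallback below.  */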
3970
3971 bool
3972 ix86_expand_fp_vec_cmp (rtx operands[])
3973 {
3974 enum rtx_code code = GET_CODE (operands[1]);
3975 rtx cmp;
3976
3977 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3978 &operands[2], &operands[3]);
3979 if (code == UNKNOWN)
3980 {
3981 rtx temp;
3982 switch (GET_CODE (operands[1]))
3983 {
3984 case LTGT:
3985 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
3986 operands[3], NULL, NULL);
3987 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
3988 operands[3], NULL, NULL);
3989 code = AND;
3990 break;
3991 case UNEQ:
3992 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
3993 operands[3], NULL, NULL);
3994 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
3995 operands[3], NULL, NULL);
3996 code = IOR;
3997 break;
3998 default:
3999 gcc_unreachable ();
4000 }
4001 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4002 OPTAB_DIRECT);
4003 }
4004 else
4005 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4006 operands[1], operands[2]);
4007
4008 if (operands[0] != cmp)
4009 emit_move_insn (operands[0], cmp);
4010
4011 return true;
4012 }
4013
4014 static rtx
4015 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4016 rtx op_true, rtx op_false, bool *negate)
4017 {
4018 machine_mode data_mode = GET_MODE (dest);
4019 machine_mode mode = GET_MODE (cop0);
4020 rtx x;
4021
4022 *negate = false;
4023
4024 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4025 if (TARGET_XOP
4026 && (mode == V16QImode || mode == V8HImode
4027 || mode == V4SImode || mode == V2DImode))
4028 ;
4029   /* AVX512F supports all of the comparisons
4030 on all 128/256/512-bit vector int types. */
4031 else if (ix86_valid_mask_cmp_mode (mode))
4032 ;
4033 else
4034 {
4035 /* Canonicalize the comparison to EQ, GT, GTU. */
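      /* E.g. "x >= y" is handled as "NOT (y > x)" (reverse plus swap) and
	 "x != y" as "NOT (x == y)"; *negate records the pending NOT for
	 the caller.  This is a summary of the switch below.  */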
4036 switch (code)
4037 {
4038 case EQ:
4039 case GT:
4040 case GTU:
4041 break;
4042
4043 case NE:
4044 case LE:
4045 case LEU:
4046 code = reverse_condition (code);
4047 *negate = true;
4048 break;
4049
4050 case GE:
4051 case GEU:
4052 code = reverse_condition (code);
4053 *negate = true;
4054 /* FALLTHRU */
4055
4056 case LT:
4057 case LTU:
4058 std::swap (cop0, cop1);
4059 code = swap_condition (code);
4060 break;
4061
4062 default:
4063 gcc_unreachable ();
4064 }
4065
4066 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4067 if (mode == V2DImode)
4068 {
4069 switch (code)
4070 {
4071 case EQ:
4072 /* SSE4.1 supports EQ. */
4073 if (!TARGET_SSE4_1)
4074 return NULL;
4075 break;
4076
4077 case GT:
4078 case GTU:
4079 /* SSE4.2 supports GT/GTU. */
4080 if (!TARGET_SSE4_2)
4081 return NULL;
4082 break;
4083
4084 default:
4085 gcc_unreachable ();
4086 }
4087 }
4088
4089 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4090 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4091 if (*negate)
4092 std::swap (optrue, opfalse);
4093
4094 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4095 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4096 min (x, y) == x). While we add one instruction (the minimum),
4097 we remove the need for two instructions in the negation, as the
4098 result is done this way.
4099 When using masks, do it for SI/DImode element types, as it is shorter
4100 than the two subtractions. */
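	  /* E.g. for unsigned V16QI elements, "x <= y" can be computed as
	     "pminub (x, y) == x", i.e. one unsigned minimum followed by one
	     equality compare.  */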
4101 if ((code != EQ
4102 && GET_MODE_SIZE (mode) != 64
4103 && vector_all_ones_operand (opfalse, data_mode)
4104 && optrue == CONST0_RTX (data_mode))
4105 || (code == GTU
4106 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4107 /* Don't do it if not using integer masks and we'd end up with
4108 the right values in the registers though. */
4109 && (GET_MODE_SIZE (mode) == 64
4110 || !vector_all_ones_operand (optrue, data_mode)
4111 || opfalse != CONST0_RTX (data_mode))))
4112 {
4113 rtx (*gen) (rtx, rtx, rtx) = NULL;
4114
4115 switch (mode)
4116 {
4117 case E_V16SImode:
4118 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4119 break;
4120 case E_V8DImode:
4121 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4122 cop0 = force_reg (mode, cop0);
4123 cop1 = force_reg (mode, cop1);
4124 break;
4125 case E_V32QImode:
4126 if (TARGET_AVX2)
4127 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4128 break;
4129 case E_V16HImode:
4130 if (TARGET_AVX2)
4131 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4132 break;
4133 case E_V8SImode:
4134 if (TARGET_AVX2)
4135 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4136 break;
4137 case E_V4DImode:
4138 if (TARGET_AVX512VL)
4139 {
4140 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4141 cop0 = force_reg (mode, cop0);
4142 cop1 = force_reg (mode, cop1);
4143 }
4144 break;
4145 case E_V16QImode:
4146 if (code == GTU && TARGET_SSE2)
4147 gen = gen_uminv16qi3;
4148 else if (code == GT && TARGET_SSE4_1)
4149 gen = gen_sminv16qi3;
4150 break;
4151 case E_V8HImode:
4152 if (code == GTU && TARGET_SSE4_1)
4153 gen = gen_uminv8hi3;
4154 else if (code == GT && TARGET_SSE2)
4155 gen = gen_sminv8hi3;
4156 break;
4157 case E_V4SImode:
4158 if (TARGET_SSE4_1)
4159 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4160 break;
4161 case E_V2DImode:
4162 if (TARGET_AVX512VL)
4163 {
4164 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4165 cop0 = force_reg (mode, cop0);
4166 cop1 = force_reg (mode, cop1);
4167 }
4168 break;
4169 default:
4170 break;
4171 }
4172
4173 if (gen)
4174 {
4175 rtx tem = gen_reg_rtx (mode);
4176 if (!vector_operand (cop0, mode))
4177 cop0 = force_reg (mode, cop0);
4178 if (!vector_operand (cop1, mode))
4179 cop1 = force_reg (mode, cop1);
4180 *negate = !*negate;
4181 emit_insn (gen (tem, cop0, cop1));
4182 cop1 = tem;
4183 code = EQ;
4184 }
4185 }
4186
4187 /* Unsigned parallel compare is not supported by the hardware.
4188 Play some tricks to turn this into a signed comparison
4189 against 0. */
4190 if (code == GTU)
4191 {
4192 cop0 = force_reg (mode, cop0);
4193
4194 switch (mode)
4195 {
4196 case E_V16SImode:
4197 case E_V8DImode:
4198 case E_V8SImode:
4199 case E_V4DImode:
4200 case E_V4SImode:
4201 case E_V2DImode:
4202 {
4203 rtx t1, t2, mask;
4204
4205 /* Subtract (-(INT MAX) - 1) from both operands to make
4206 them signed. */
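/* Illustrative note: subtracting 0x80...0 in modular arithmetic just
   flips the sign bit of each element, so x >u y holds exactly when
   (x - 0x80...0) >s (y - 0x80...0), which the hardware can do.  */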
4207 mask = ix86_build_signbit_mask (mode, true, false);
4208 t1 = gen_reg_rtx (mode);
4209 emit_insn (gen_sub3_insn (t1, cop0, mask));
4210
4211 t2 = gen_reg_rtx (mode);
4212 emit_insn (gen_sub3_insn (t2, cop1, mask));
4213
4214 cop0 = t1;
4215 cop1 = t2;
4216 code = GT;
4217 }
4218 break;
4219
4220 case E_V64QImode:
4221 case E_V32HImode:
4222 case E_V32QImode:
4223 case E_V16HImode:
4224 case E_V16QImode:
4225 case E_V8HImode:
4226 /* Perform a parallel unsigned saturating subtraction. */
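/* Illustrative note: x -us y is nonzero exactly when x >u y, so the
   GTU mask is obtained by comparing the saturating difference against
   zero for equality and negating the result.  */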
4227 x = gen_reg_rtx (mode);
4228 emit_insn (gen_rtx_SET
4229 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4230 cop0 = x;
4231 cop1 = CONST0_RTX (mode);
4232 code = EQ;
4233 *negate = !*negate;
4234 break;
4235
4236 default:
4237 gcc_unreachable ();
4238 }
4239 }
4240 }
4241
4242 if (*negate)
4243 std::swap (op_true, op_false);
4244
4245 /* Allow the comparison to be done in one mode, but the movcc to
4246 happen in another mode. */
4247 if (data_mode == mode)
4248 {
4249 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4250 op_true, op_false);
4251 }
4252 else
4253 {
4254 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4255 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4256 op_true, op_false);
4257 if (GET_MODE (x) == mode)
4258 x = gen_lowpart (data_mode, x);
4259 }
4260
4261 return x;
4262 }
4263
4264 /* Expand integer vector comparison. */
4265
4266 bool
4267 ix86_expand_int_vec_cmp (rtx operands[])
4268 {
4269 rtx_code code = GET_CODE (operands[1]);
4270 bool negate = false;
4271 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4272 operands[3], NULL, NULL, &negate);
4273
4274 if (!cmp)
4275 return false;
4276
4277 if (negate)
4278 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4279 CONST0_RTX (GET_MODE (cmp)),
4280 NULL, NULL, &negate);
4281
4282 gcc_assert (!negate);
4283
4284 if (operands[0] != cmp)
4285 emit_move_insn (operands[0], cmp);
4286
4287 return true;
4288 }
4289
4290 /* Expand a floating-point vector conditional move; a vcond operation
4291 rather than a movcc operation. */
4292
4293 bool
4294 ix86_expand_fp_vcond (rtx operands[])
4295 {
4296 enum rtx_code code = GET_CODE (operands[3]);
4297 rtx cmp;
4298
4299 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4300 &operands[4], &operands[5]);
4301 if (code == UNKNOWN)
4302 {
4303 rtx temp;
4304 switch (GET_CODE (operands[3]))
4305 {
4306 case LTGT:
4307 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4308 operands[5], operands[0], operands[0]);
4309 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4310 operands[5], operands[1], operands[2]);
4311 code = AND;
4312 break;
4313 case UNEQ:
4314 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4315 operands[5], operands[0], operands[0]);
4316 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4317 operands[5], operands[1], operands[2]);
4318 code = IOR;
4319 break;
4320 default:
4321 gcc_unreachable ();
4322 }
4323 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4324 OPTAB_DIRECT);
4325 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4326 return true;
4327 }
4328
4329 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4330 operands[5], operands[1], operands[2]))
4331 return true;
4332
4333 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4334 operands[1], operands[2]);
4335 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4336 return true;
4337 }
4338
4339 /* Expand a signed/unsigned integral vector conditional move. */
4340
4341 bool
4342 ix86_expand_int_vcond (rtx operands[])
4343 {
4344 machine_mode data_mode = GET_MODE (operands[0]);
4345 machine_mode mode = GET_MODE (operands[4]);
4346 enum rtx_code code = GET_CODE (operands[3]);
4347 bool negate = false;
4348 rtx x, cop0, cop1;
4349
4350 cop0 = operands[4];
4351 cop1 = operands[5];
4352
4353 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4354 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4355 if ((code == LT || code == GE)
4356 && data_mode == mode
4357 && cop1 == CONST0_RTX (mode)
4358 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4359 && GET_MODE_UNIT_SIZE (data_mode) > 1
4360 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4361 && (GET_MODE_SIZE (data_mode) == 16
4362 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4363 {
4364 rtx negop = operands[2 - (code == LT)];
4365 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4366 if (negop == CONST1_RTX (data_mode))
4367 {
4368 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4369 operands[0], 1, OPTAB_DIRECT);
4370 if (res != operands[0])
4371 emit_move_insn (operands[0], res);
4372 return true;
4373 }
4374 else if (GET_MODE_INNER (data_mode) != DImode
4375 && vector_all_ones_operand (negop, data_mode))
4376 {
4377 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4378 operands[0], 0, OPTAB_DIRECT);
4379 if (res != operands[0])
4380 emit_move_insn (operands[0], res);
4381 return true;
4382 }
4383 }
4384
4385 if (!nonimmediate_operand (cop1, mode))
4386 cop1 = force_reg (mode, cop1);
4387 if (!general_operand (operands[1], data_mode))
4388 operands[1] = force_reg (data_mode, operands[1]);
4389 if (!general_operand (operands[2], data_mode))
4390 operands[2] = force_reg (data_mode, operands[2]);
4391
4392 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4393 operands[1], operands[2], &negate);
4394
4395 if (!x)
4396 return false;
4397
4398 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4399 operands[2-negate]);
4400 return true;
4401 }
4402
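/* Try to expand a full variable permutation with a single VPERMT2
   instruction.  The operation is described either by D (when called from
   the constant-permutation code) or by TARGET, MASK, OP0 and OP1 (when
   called from the generic expander below).  Return true if an insn was
   emitted, false if the mode/ISA combination is not supported.  */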
4403 static bool
4404 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4405 struct expand_vec_perm_d *d)
4406 {
4407 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4408 expander, so args are either in d, or in op0, op1 etc. */
4409 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4410 machine_mode maskmode = mode;
4411 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4412
4413 switch (mode)
4414 {
4415 case E_V8HImode:
4416 if (TARGET_AVX512VL && TARGET_AVX512BW)
4417 gen = gen_avx512vl_vpermt2varv8hi3;
4418 break;
4419 case E_V16HImode:
4420 if (TARGET_AVX512VL && TARGET_AVX512BW)
4421 gen = gen_avx512vl_vpermt2varv16hi3;
4422 break;
4423 case E_V64QImode:
4424 if (TARGET_AVX512VBMI)
4425 gen = gen_avx512bw_vpermt2varv64qi3;
4426 break;
4427 case E_V32HImode:
4428 if (TARGET_AVX512BW)
4429 gen = gen_avx512bw_vpermt2varv32hi3;
4430 break;
4431 case E_V4SImode:
4432 if (TARGET_AVX512VL)
4433 gen = gen_avx512vl_vpermt2varv4si3;
4434 break;
4435 case E_V8SImode:
4436 if (TARGET_AVX512VL)
4437 gen = gen_avx512vl_vpermt2varv8si3;
4438 break;
4439 case E_V16SImode:
4440 if (TARGET_AVX512F)
4441 gen = gen_avx512f_vpermt2varv16si3;
4442 break;
4443 case E_V4SFmode:
4444 if (TARGET_AVX512VL)
4445 {
4446 gen = gen_avx512vl_vpermt2varv4sf3;
4447 maskmode = V4SImode;
4448 }
4449 break;
4450 case E_V8SFmode:
4451 if (TARGET_AVX512VL)
4452 {
4453 gen = gen_avx512vl_vpermt2varv8sf3;
4454 maskmode = V8SImode;
4455 }
4456 break;
4457 case E_V16SFmode:
4458 if (TARGET_AVX512F)
4459 {
4460 gen = gen_avx512f_vpermt2varv16sf3;
4461 maskmode = V16SImode;
4462 }
4463 break;
4464 case E_V2DImode:
4465 if (TARGET_AVX512VL)
4466 gen = gen_avx512vl_vpermt2varv2di3;
4467 break;
4468 case E_V4DImode:
4469 if (TARGET_AVX512VL)
4470 gen = gen_avx512vl_vpermt2varv4di3;
4471 break;
4472 case E_V8DImode:
4473 if (TARGET_AVX512F)
4474 gen = gen_avx512f_vpermt2varv8di3;
4475 break;
4476 case E_V2DFmode:
4477 if (TARGET_AVX512VL)
4478 {
4479 gen = gen_avx512vl_vpermt2varv2df3;
4480 maskmode = V2DImode;
4481 }
4482 break;
4483 case E_V4DFmode:
4484 if (TARGET_AVX512VL)
4485 {
4486 gen = gen_avx512vl_vpermt2varv4df3;
4487 maskmode = V4DImode;
4488 }
4489 break;
4490 case E_V8DFmode:
4491 if (TARGET_AVX512F)
4492 {
4493 gen = gen_avx512f_vpermt2varv8df3;
4494 maskmode = V8DImode;
4495 }
4496 break;
4497 default:
4498 break;
4499 }
4500
4501 if (gen == NULL)
4502 return false;
4503
4504 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4505 expander, so args are either in d, or in op0, op1 etc. */
4506 if (d)
4507 {
4508 rtx vec[64];
4509 target = d->target;
4510 op0 = d->op0;
4511 op1 = d->op1;
4512 for (int i = 0; i < d->nelt; ++i)
4513 vec[i] = GEN_INT (d->perm[i]);
4514 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4515 }
4516
4517 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4518 return true;
4519 }
4520
4521 /* Expand a variable vector permutation. */
4522
4523 void
4524 ix86_expand_vec_perm (rtx operands[])
4525 {
4526 rtx target = operands[0];
4527 rtx op0 = operands[1];
4528 rtx op1 = operands[2];
4529 rtx mask = operands[3];
4530 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4531 machine_mode mode = GET_MODE (op0);
4532 machine_mode maskmode = GET_MODE (mask);
4533 int w, e, i;
4534 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4535
4536 /* Number of elements in the vector. */
4537 w = GET_MODE_NUNITS (mode);
4538 e = GET_MODE_UNIT_SIZE (mode);
4539 gcc_assert (w <= 64);
4540
4541 if (TARGET_AVX512F && one_operand_shuffle)
4542 {
4543 rtx (*gen) (rtx, rtx, rtx) = NULL;
4544 switch (mode)
4545 {
4546 case E_V16SImode:
4547 gen = gen_avx512f_permvarv16si;
4548 break;
4549 case E_V16SFmode:
4550 gen = gen_avx512f_permvarv16sf;
4551 break;
4552 case E_V8DImode:
4553 gen = gen_avx512f_permvarv8di;
4554 break;
4555 case E_V8DFmode:
4556 gen = gen_avx512f_permvarv8df;
4557 break;
4558 default:
4559 break;
4560 }
4561 if (gen != NULL)
4562 {
4563 emit_insn (gen (target, op0, mask));
4564 return;
4565 }
4566 }
4567
4568 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4569 return;
4570
4571 if (TARGET_AVX2)
4572 {
4573 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4574 {
4575 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4576 a constant shuffle operand. With a tiny bit of effort we can
4577 use VPERMD instead. A re-interpretation stall for V4DFmode is
4578 unfortunate but there's no avoiding it.
4579 Similarly for V16HImode we don't have instructions for variable
4580 shuffling, while for V32QImode, after preparing suitable masks, we
4581 can use vpshufb; vpshufb; vpermq; vpor. */
4582
4583 if (mode == V16HImode)
4584 {
4585 maskmode = mode = V32QImode;
4586 w = 32;
4587 e = 1;
4588 }
4589 else
4590 {
4591 maskmode = mode = V8SImode;
4592 w = 8;
4593 e = 4;
4594 }
4595 t1 = gen_reg_rtx (maskmode);
4596
4597 /* Replicate the low bits of the V4DImode mask into V8SImode:
4598 mask = { A B C D }
4599 t1 = { A A B B C C D D }. */
4600 for (i = 0; i < w / 2; ++i)
4601 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4602 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4603 vt = force_reg (maskmode, vt);
4604 mask = gen_lowpart (maskmode, mask);
4605 if (maskmode == V8SImode)
4606 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4607 else
4608 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4609
4610 /* Multiply the shuffle indices by two. */
4611 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4612 OPTAB_DIRECT);
4613
4614 /* Add one to the odd shuffle indices:
4615 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4616 for (i = 0; i < w / 2; ++i)
4617 {
4618 vec[i * 2] = const0_rtx;
4619 vec[i * 2 + 1] = const1_rtx;
4620 }
4621 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4622 vt = validize_mem (force_const_mem (maskmode, vt));
4623 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4624 OPTAB_DIRECT);
4625
4626 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4627 operands[3] = mask = t1;
4628 target = gen_reg_rtx (mode);
4629 op0 = gen_lowpart (mode, op0);
4630 op1 = gen_lowpart (mode, op1);
4631 }
4632
4633 switch (mode)
4634 {
4635 case E_V8SImode:
4636 /* The VPERMD and VPERMPS instructions already properly ignore
4637 the high bits of the shuffle elements. No need for us to
4638 perform an AND ourselves. */
4639 if (one_operand_shuffle)
4640 {
4641 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4642 if (target != operands[0])
4643 emit_move_insn (operands[0],
4644 gen_lowpart (GET_MODE (operands[0]), target));
4645 }
4646 else
4647 {
4648 t1 = gen_reg_rtx (V8SImode);
4649 t2 = gen_reg_rtx (V8SImode);
4650 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4651 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4652 goto merge_two;
4653 }
4654 return;
4655
4656 case E_V8SFmode:
4657 mask = gen_lowpart (V8SImode, mask);
4658 if (one_operand_shuffle)
4659 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4660 else
4661 {
4662 t1 = gen_reg_rtx (V8SFmode);
4663 t2 = gen_reg_rtx (V8SFmode);
4664 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4665 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4666 goto merge_two;
4667 }
4668 return;
4669
4670 case E_V4SImode:
4671 /* By combining the two 128-bit input vectors into one 256-bit
4672 input vector, we can use VPERMD and VPERMPS for the full
4673 two-operand shuffle. */
4674 t1 = gen_reg_rtx (V8SImode);
4675 t2 = gen_reg_rtx (V8SImode);
4676 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4677 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4678 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4679 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4680 return;
4681
4682 case E_V4SFmode:
4683 t1 = gen_reg_rtx (V8SFmode);
4684 t2 = gen_reg_rtx (V8SImode);
4685 mask = gen_lowpart (V4SImode, mask);
4686 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4687 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4688 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4689 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4690 return;
4691
4692 case E_V32QImode:
4693 t1 = gen_reg_rtx (V32QImode);
4694 t2 = gen_reg_rtx (V32QImode);
4695 t3 = gen_reg_rtx (V32QImode);
4696 vt2 = GEN_INT (-128);
4697 vt = gen_const_vec_duplicate (V32QImode, vt2);
4698 vt = force_reg (V32QImode, vt);
4699 for (i = 0; i < 32; i++)
4700 vec[i] = i < 16 ? vt2 : const0_rtx;
4701 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4702 vt2 = force_reg (V32QImode, vt2);
4703 /* From mask create two adjusted masks, which contain the same
4704 bits as mask in the low 7 bits of each vector element.
4705 The first mask will have the most significant bit clear
4706 if it requests element from the same 128-bit lane
4707 and MSB set if it requests element from the other 128-bit lane.
4708 The second mask will have the opposite values of the MSB,
4709 and additionally will have its 128-bit lanes swapped.
4710 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4711 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4712 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4713 stands for other 12 bytes. */
4714 /* The bit that says whether an element is from the same lane or the
4715 other lane is bit 4, so shift it up by 3 to the MSB position. */
4716 t5 = gen_reg_rtx (V4DImode);
4717 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4718 GEN_INT (3)));
4719 /* Clear MSB bits from the mask just in case it had them set. */
4720 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4721 /* After this t1 will have MSB set for elements from other lane. */
4722 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4723 /* Clear bits other than MSB. */
4724 emit_insn (gen_andv32qi3 (t1, t1, vt));
4725 /* Or in the lower bits from mask into t3. */
4726 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4727 /* And invert MSB bits in t1, so MSB is set for elements from the same
4728 lane. */
4729 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4730 /* Swap 128-bit lanes in t3. */
4731 t6 = gen_reg_rtx (V4DImode);
4732 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4733 const2_rtx, GEN_INT (3),
4734 const0_rtx, const1_rtx));
4735 /* And or in the lower bits from mask into t1. */
4736 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4737 if (one_operand_shuffle)
4738 {
4739 /* Each of these shuffles will put 0s in places where an
4740 element from the other 128-bit lane is needed, and otherwise
4741 will shuffle in the requested value. */
4742 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4743 gen_lowpart (V32QImode, t6)));
4744 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4745 /* For t3 the 128-bit lanes are swapped again. */
4746 t7 = gen_reg_rtx (V4DImode);
4747 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4748 const2_rtx, GEN_INT (3),
4749 const0_rtx, const1_rtx));
4750 /* And oring both together leads to the result. */
4751 emit_insn (gen_iorv32qi3 (target, t1,
4752 gen_lowpart (V32QImode, t7)));
4753 if (target != operands[0])
4754 emit_move_insn (operands[0],
4755 gen_lowpart (GET_MODE (operands[0]), target));
4756 return;
4757 }
4758
4759 t4 = gen_reg_rtx (V32QImode);
4760 /* Similar to the above one_operand_shuffle code, just
4761 repeated twice, once for each operand. The merge_two:
4762 code below will merge the two results together. */
4763 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4764 gen_lowpart (V32QImode, t6)));
4765 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4766 gen_lowpart (V32QImode, t6)));
4767 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4768 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4769 t7 = gen_reg_rtx (V4DImode);
4770 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4771 const2_rtx, GEN_INT (3),
4772 const0_rtx, const1_rtx));
4773 t8 = gen_reg_rtx (V4DImode);
4774 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4775 const2_rtx, GEN_INT (3),
4776 const0_rtx, const1_rtx));
4777 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4778 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4779 t1 = t4;
4780 t2 = t3;
4781 goto merge_two;
4782
4783 default:
4784 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4785 break;
4786 }
4787 }
4788
4789 if (TARGET_XOP)
4790 {
4791 /* The XOP VPPERM insn supports three inputs. By ignoring the
4792 one_operand_shuffle special case, we avoid creating another
4793 set of constant vectors in memory. */
4794 one_operand_shuffle = false;
4795
4796 /* mask = mask & {2*w-1, ...} */
4797 vt = GEN_INT (2*w - 1);
4798 }
4799 else
4800 {
4801 /* mask = mask & {w-1, ...} */
4802 vt = GEN_INT (w - 1);
4803 }
4804
4805 vt = gen_const_vec_duplicate (maskmode, vt);
4806 mask = expand_simple_binop (maskmode, AND, mask, vt,
4807 NULL_RTX, 0, OPTAB_DIRECT);
4808
4809 /* For non-QImode operations, convert the word permutation control
4810 into a byte permutation control. */
4811 if (mode != V16QImode)
4812 {
4813 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4814 GEN_INT (exact_log2 (e)),
4815 NULL_RTX, 0, OPTAB_DIRECT);
4816
4817 /* Convert mask to vector of chars. */
4818 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4819
4820 /* Replicate each of the input bytes into byte positions:
4821 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4822 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4823 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4824 for (i = 0; i < 16; ++i)
4825 vec[i] = GEN_INT (i/e * e);
4826 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4827 vt = validize_mem (force_const_mem (V16QImode, vt));
4828 if (TARGET_XOP)
4829 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4830 else
4831 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4832
4833 /* Convert it into the byte positions by doing
4834 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4835 for (i = 0; i < 16; ++i)
4836 vec[i] = GEN_INT (i % e);
4837 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4838 vt = validize_mem (force_const_mem (V16QImode, vt));
4839 emit_insn (gen_addv16qi3 (mask, mask, vt));
4840 }
4841
4842 /* The actual shuffle operations all operate on V16QImode. */
4843 op0 = gen_lowpart (V16QImode, op0);
4844 op1 = gen_lowpart (V16QImode, op1);
4845
4846 if (TARGET_XOP)
4847 {
4848 if (GET_MODE (target) != V16QImode)
4849 target = gen_reg_rtx (V16QImode);
4850 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4851 if (target != operands[0])
4852 emit_move_insn (operands[0],
4853 gen_lowpart (GET_MODE (operands[0]), target));
4854 }
4855 else if (one_operand_shuffle)
4856 {
4857 if (GET_MODE (target) != V16QImode)
4858 target = gen_reg_rtx (V16QImode);
4859 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4860 if (target != operands[0])
4861 emit_move_insn (operands[0],
4862 gen_lowpart (GET_MODE (operands[0]), target));
4863 }
4864 else
4865 {
4866 rtx xops[6];
4867 bool ok;
4868
4869 /* Shuffle the two input vectors independently. */
4870 t1 = gen_reg_rtx (V16QImode);
4871 t2 = gen_reg_rtx (V16QImode);
4872 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4873 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4874
4875 merge_two:
4876 /* Then merge them together. The key is whether any given control
4877 element contained a bit set that indicates the second word. */
4878 mask = operands[3];
4879 vt = GEN_INT (w);
4880 if (maskmode == V2DImode && !TARGET_SSE4_1)
4881 {
4882 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4883 more shuffle to convert the V2DI input mask into a V4SI
4884 input mask. At that point the masking done by
4885 ix86_expand_int_vcond will work as desired. */
4886 rtx t3 = gen_reg_rtx (V4SImode);
4887 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4888 const0_rtx, const0_rtx,
4889 const2_rtx, const2_rtx));
4890 mask = t3;
4891 maskmode = V4SImode;
4892 e = w = 4;
4893 }
4894
4895 vt = gen_const_vec_duplicate (maskmode, vt);
4896 vt = force_reg (maskmode, vt);
4897 mask = expand_simple_binop (maskmode, AND, mask, vt,
4898 NULL_RTX, 0, OPTAB_DIRECT);
4899
4900 if (GET_MODE (target) != mode)
4901 target = gen_reg_rtx (mode);
4902 xops[0] = target;
4903 xops[1] = gen_lowpart (mode, t2);
4904 xops[2] = gen_lowpart (mode, t1);
4905 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4906 xops[4] = mask;
4907 xops[5] = vt;
4908 ok = ix86_expand_int_vcond (xops);
4909 gcc_assert (ok);
4910 if (target != operands[0])
4911 emit_move_insn (operands[0],
4912 gen_lowpart (GET_MODE (operands[0]), target));
4913 }
4914 }
4915
4916 /* Unpack SRC into the next wider integer vector type and store the
4917 result in DEST. UNSIGNED_P is true if we should do zero extension,
4918 else sign extension. HIGH_P is true if we want the N/2 high elements, else the low elements. */
4919
4920 void
4921 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4922 {
4923 machine_mode imode = GET_MODE (src);
4924 rtx tmp;
4925
4926 if (TARGET_SSE4_1)
4927 {
4928 rtx (*unpack)(rtx, rtx);
4929 rtx (*extract)(rtx, rtx) = NULL;
4930 machine_mode halfmode = BLKmode;
4931
4932 switch (imode)
4933 {
4934 case E_V64QImode:
4935 if (unsigned_p)
4936 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4937 else
4938 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4939 halfmode = V32QImode;
4940 extract
4941 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4942 break;
4943 case E_V32QImode:
4944 if (unsigned_p)
4945 unpack = gen_avx2_zero_extendv16qiv16hi2;
4946 else
4947 unpack = gen_avx2_sign_extendv16qiv16hi2;
4948 halfmode = V16QImode;
4949 extract
4950 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4951 break;
4952 case E_V32HImode:
4953 if (unsigned_p)
4954 unpack = gen_avx512f_zero_extendv16hiv16si2;
4955 else
4956 unpack = gen_avx512f_sign_extendv16hiv16si2;
4957 halfmode = V16HImode;
4958 extract
4959 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4960 break;
4961 case E_V16HImode:
4962 if (unsigned_p)
4963 unpack = gen_avx2_zero_extendv8hiv8si2;
4964 else
4965 unpack = gen_avx2_sign_extendv8hiv8si2;
4966 halfmode = V8HImode;
4967 extract
4968 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4969 break;
4970 case E_V16SImode:
4971 if (unsigned_p)
4972 unpack = gen_avx512f_zero_extendv8siv8di2;
4973 else
4974 unpack = gen_avx512f_sign_extendv8siv8di2;
4975 halfmode = V8SImode;
4976 extract
4977 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4978 break;
4979 case E_V8SImode:
4980 if (unsigned_p)
4981 unpack = gen_avx2_zero_extendv4siv4di2;
4982 else
4983 unpack = gen_avx2_sign_extendv4siv4di2;
4984 halfmode = V4SImode;
4985 extract
4986 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
4987 break;
4988 case E_V16QImode:
4989 if (unsigned_p)
4990 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
4991 else
4992 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
4993 break;
4994 case E_V8HImode:
4995 if (unsigned_p)
4996 unpack = gen_sse4_1_zero_extendv4hiv4si2;
4997 else
4998 unpack = gen_sse4_1_sign_extendv4hiv4si2;
4999 break;
5000 case E_V4SImode:
5001 if (unsigned_p)
5002 unpack = gen_sse4_1_zero_extendv2siv2di2;
5003 else
5004 unpack = gen_sse4_1_sign_extendv2siv2di2;
5005 break;
5006 default:
5007 gcc_unreachable ();
5008 }
5009
5010 if (GET_MODE_SIZE (imode) >= 32)
5011 {
5012 tmp = gen_reg_rtx (halfmode);
5013 emit_insn (extract (tmp, src));
5014 }
5015 else if (high_p)
5016 {
5017 /* Shift higher 8 bytes to lower 8 bytes. */
5018 tmp = gen_reg_rtx (V1TImode);
5019 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5020 GEN_INT (64)));
5021 tmp = gen_lowpart (imode, tmp);
5022 }
5023 else
5024 tmp = src;
5025
5026 emit_insn (unpack (dest, tmp));
5027 }
5028 else
5029 {
5030 rtx (*unpack)(rtx, rtx, rtx);
5031
5032 switch (imode)
5033 {
5034 case E_V16QImode:
5035 if (high_p)
5036 unpack = gen_vec_interleave_highv16qi;
5037 else
5038 unpack = gen_vec_interleave_lowv16qi;
5039 break;
5040 case E_V8HImode:
5041 if (high_p)
5042 unpack = gen_vec_interleave_highv8hi;
5043 else
5044 unpack = gen_vec_interleave_lowv8hi;
5045 break;
5046 case E_V4SImode:
5047 if (high_p)
5048 unpack = gen_vec_interleave_highv4si;
5049 else
5050 unpack = gen_vec_interleave_lowv4si;
5051 break;
5052 default:
5053 gcc_unreachable ();
5054 }
5055
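/* Without the SSE4.1 extension insns, synthesize the widening by
   interleaving: interleave SRC with zero for zero extension, or with
   a mask of its sign bits (computed as 0 > SRC) for sign extension.  */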
5056 if (unsigned_p)
5057 tmp = force_reg (imode, CONST0_RTX (imode));
5058 else
5059 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5060 src, pc_rtx, pc_rtx);
5061
5062 rtx tmp2 = gen_reg_rtx (imode);
5063 emit_insn (unpack (tmp2, src, tmp));
5064 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5065 }
5066 }
5067
5068 /* Split OPERAND into word-sized parts, stored in PARTS. Similar to
5069 split_double_mode, but works for floating-point parameters and
5070 non-offsettable memories. For pushes, it returns just stack offsets;
5071 the values will be saved in the right order. At most four parts are generated; the number of parts is returned. */
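/* For instance (illustrative): on a 32-bit target a DFmode value splits
   into 2 SImode parts, XFmode into 3 and TFmode into 4, while on a
   64-bit target XFmode and TFmode each split into 2 parts.  */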
5072
5073 static int
5074 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5075 {
5076 int size;
5077
5078 if (!TARGET_64BIT)
5079 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5080 else
5081 size = (GET_MODE_SIZE (mode) + 4) / 8;
5082
5083 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5084 gcc_assert (size >= 2 && size <= 4);
5085
5086 /* Optimize constant pool reference to immediates. This is used by fp
5087 moves, that force all constants to memory to allow combining. */
5088 if (MEM_P (operand) && MEM_READONLY_P (operand))
5089 operand = avoid_constant_pool_reference (operand);
5090
5091 if (MEM_P (operand) && !offsettable_memref_p (operand))
5092 {
5093 /* The only non-offsetable memories we handle are pushes. */
5094 int ok = push_operand (operand, VOIDmode);
5095
5096 gcc_assert (ok);
5097
5098 operand = copy_rtx (operand);
5099 PUT_MODE (operand, word_mode);
5100 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5101 return size;
5102 }
5103
5104 if (GET_CODE (operand) == CONST_VECTOR)
5105 {
5106 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5107 /* Caution: if we looked through a constant pool memory above,
5108 the operand may actually have a different mode now. That's
5109 ok, since we want to pun this all the way back to an integer. */
5110 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5111 gcc_assert (operand != NULL);
5112 mode = imode;
5113 }
5114
5115 if (!TARGET_64BIT)
5116 {
5117 if (mode == DImode)
5118 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5119 else
5120 {
5121 int i;
5122
5123 if (REG_P (operand))
5124 {
5125 gcc_assert (reload_completed);
5126 for (i = 0; i < size; i++)
5127 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5128 }
5129 else if (offsettable_memref_p (operand))
5130 {
5131 operand = adjust_address (operand, SImode, 0);
5132 parts[0] = operand;
5133 for (i = 1; i < size; i++)
5134 parts[i] = adjust_address (operand, SImode, 4 * i);
5135 }
5136 else if (CONST_DOUBLE_P (operand))
5137 {
5138 const REAL_VALUE_TYPE *r;
5139 long l[4];
5140
5141 r = CONST_DOUBLE_REAL_VALUE (operand);
5142 switch (mode)
5143 {
5144 case E_TFmode:
5145 real_to_target (l, r, mode);
5146 parts[3] = gen_int_mode (l[3], SImode);
5147 parts[2] = gen_int_mode (l[2], SImode);
5148 break;
5149 case E_XFmode:
5150 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5151 long double may not be 80-bit. */
5152 real_to_target (l, r, mode);
5153 parts[2] = gen_int_mode (l[2], SImode);
5154 break;
5155 case E_DFmode:
5156 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5157 break;
5158 default:
5159 gcc_unreachable ();
5160 }
5161 parts[1] = gen_int_mode (l[1], SImode);
5162 parts[0] = gen_int_mode (l[0], SImode);
5163 }
5164 else
5165 gcc_unreachable ();
5166 }
5167 }
5168 else
5169 {
5170 if (mode == TImode)
5171 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5172 if (mode == XFmode || mode == TFmode)
5173 {
5174 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5175 if (REG_P (operand))
5176 {
5177 gcc_assert (reload_completed);
5178 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5179 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5180 }
5181 else if (offsettable_memref_p (operand))
5182 {
5183 operand = adjust_address (operand, DImode, 0);
5184 parts[0] = operand;
5185 parts[1] = adjust_address (operand, upper_mode, 8);
5186 }
5187 else if (CONST_DOUBLE_P (operand))
5188 {
5189 long l[4];
5190
5191 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5192
5193 /* real_to_target puts 32-bit pieces in each long. */
5194 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5195 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5196 << 32), DImode);
5197
5198 if (upper_mode == SImode)
5199 parts[1] = gen_int_mode (l[2], SImode);
5200 else
5201 parts[1]
5202 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5203 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5204 << 32), DImode);
5205 }
5206 else
5207 gcc_unreachable ();
5208 }
5209 }
5210
5211 return size;
5212 }
5213
5214 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5215 All required insns are emitted here; nothing is returned. Entries 2
5216 onward of OPERANDS are used as scratch space for the destination
5217 parts and entries 6 onward for the source parts, in the correct order. */
5218
5219 void
5220 ix86_split_long_move (rtx operands[])
5221 {
5222 rtx part[2][4];
5223 int nparts, i, j;
5224 int push = 0;
5225 int collisions = 0;
5226 machine_mode mode = GET_MODE (operands[0]);
5227 bool collisionparts[4];
5228
5229 /* The DFmode expanders may ask us to move a double.
5230 For a 64-bit target this is a single move. By hiding that fact
5231 here we simplify the i386.md splitters. */
5232 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5233 {
5234 /* Optimize constant pool reference to immediates. This is used by
5235 fp moves, that force all constants to memory to allow combining. */
5236
5237 if (MEM_P (operands[1])
5238 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5239 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5240 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5241 if (push_operand (operands[0], VOIDmode))
5242 {
5243 operands[0] = copy_rtx (operands[0]);
5244 PUT_MODE (operands[0], word_mode);
5245 }
5246 else
5247 operands[0] = gen_lowpart (DImode, operands[0]);
5248 operands[1] = gen_lowpart (DImode, operands[1]);
5249 emit_move_insn (operands[0], operands[1]);
5250 return;
5251 }
5252
5253 /* The only non-offsettable memory we handle is push. */
5254 if (push_operand (operands[0], VOIDmode))
5255 push = 1;
5256 else
5257 gcc_assert (!MEM_P (operands[0])
5258 || offsettable_memref_p (operands[0]));
5259
5260 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5261 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5262
5263 /* When emitting a push, take care of source operands that live on the stack. */
5264 if (push && MEM_P (operands[1])
5265 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5266 {
5267 rtx src_base = XEXP (part[1][nparts - 1], 0);
5268
5269 /* Compensate for the stack decrement by 4. */
5270 if (!TARGET_64BIT && nparts == 3
5271 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5272 src_base = plus_constant (Pmode, src_base, 4);
5273
5274 /* src_base refers to the stack pointer and is
5275 automatically decreased by emitted push. */
5276 for (i = 0; i < nparts; i++)
5277 part[1][i] = change_address (part[1][i],
5278 GET_MODE (part[1][i]), src_base);
5279 }
5280
5281 /* We need to do the copy in the right order in case an address
5282 register of the source overlaps the destination. */
5283 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5284 {
5285 rtx tmp;
5286
5287 for (i = 0; i < nparts; i++)
5288 {
5289 collisionparts[i]
5290 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5291 if (collisionparts[i])
5292 collisions++;
5293 }
5294
5295 /* Collision in the middle part can be handled by reordering. */
5296 if (collisions == 1 && nparts == 3 && collisionparts [1])
5297 {
5298 std::swap (part[0][1], part[0][2]);
5299 std::swap (part[1][1], part[1][2]);
5300 }
5301 else if (collisions == 1
5302 && nparts == 4
5303 && (collisionparts [1] || collisionparts [2]))
5304 {
5305 if (collisionparts [1])
5306 {
5307 std::swap (part[0][1], part[0][2]);
5308 std::swap (part[1][1], part[1][2]);
5309 }
5310 else
5311 {
5312 std::swap (part[0][2], part[0][3]);
5313 std::swap (part[1][2], part[1][3]);
5314 }
5315 }
5316
5317 /* If there are more collisions, we can't handle it by reordering.
5318 Do an lea to the last part and use only one colliding move. */
5319 else if (collisions > 1)
5320 {
5321 rtx base, addr;
5322
5323 collisions = 1;
5324
5325 base = part[0][nparts - 1];
5326
5327 /* Handle the case when the last part isn't valid for lea.
5328 Happens in 64-bit mode storing the 12-byte XFmode. */
5329 if (GET_MODE (base) != Pmode)
5330 base = gen_rtx_REG (Pmode, REGNO (base));
5331
5332 addr = XEXP (part[1][0], 0);
5333 if (TARGET_TLS_DIRECT_SEG_REFS)
5334 {
5335 struct ix86_address parts;
5336 int ok = ix86_decompose_address (addr, &parts);
5337 gcc_assert (ok);
5338 /* It is not valid to use %gs: or %fs: in lea. */
5339 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5340 }
5341 emit_insn (gen_rtx_SET (base, addr));
5342 part[1][0] = replace_equiv_address (part[1][0], base);
5343 for (i = 1; i < nparts; i++)
5344 {
5345 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5346 part[1][i] = replace_equiv_address (part[1][i], tmp);
5347 }
5348 }
5349 }
5350
5351 if (push)
5352 {
5353 if (!TARGET_64BIT)
5354 {
5355 if (nparts == 3)
5356 {
5357 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5358 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5359 emit_move_insn (part[0][2], part[1][2]);
5360 }
5361 else if (nparts == 4)
5362 {
5363 emit_move_insn (part[0][3], part[1][3]);
5364 emit_move_insn (part[0][2], part[1][2]);
5365 }
5366 }
5367 else
5368 {
5369 /* In 64-bit mode we don't have a 32-bit push available. If the part
5370 is a register, that is OK - we just use the larger counterpart. We
5371 also retype memory - this comes from an attempt to avoid a REX
5372 prefix when moving the second half of a TFmode value. */
5373 if (GET_MODE (part[1][1]) == SImode)
5374 {
5375 switch (GET_CODE (part[1][1]))
5376 {
5377 case MEM:
5378 part[1][1] = adjust_address (part[1][1], DImode, 0);
5379 break;
5380
5381 case REG:
5382 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5383 break;
5384
5385 default:
5386 gcc_unreachable ();
5387 }
5388
5389 if (GET_MODE (part[1][0]) == SImode)
5390 part[1][0] = part[1][1];
5391 }
5392 }
5393 emit_move_insn (part[0][1], part[1][1]);
5394 emit_move_insn (part[0][0], part[1][0]);
5395 return;
5396 }
5397
5398 /* Choose correct order to not overwrite the source before it is copied. */
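/* Illustrative example: if the destination's low-word register still
   holds a source high word, emitting the low-word move first would
   clobber that source part, so the parts are emitted from high to low
   instead.  */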
5399 if ((REG_P (part[0][0])
5400 && REG_P (part[1][1])
5401 && (REGNO (part[0][0]) == REGNO (part[1][1])
5402 || (nparts == 3
5403 && REGNO (part[0][0]) == REGNO (part[1][2]))
5404 || (nparts == 4
5405 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5406 || (collisions > 0
5407 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5408 {
5409 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5410 {
5411 operands[2 + i] = part[0][j];
5412 operands[6 + i] = part[1][j];
5413 }
5414 }
5415 else
5416 {
5417 for (i = 0; i < nparts; i++)
5418 {
5419 operands[2 + i] = part[0][i];
5420 operands[6 + i] = part[1][i];
5421 }
5422 }
5423
5424 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5425 if (optimize_insn_for_size_p ())
5426 {
5427 for (j = 0; j < nparts - 1; j++)
5428 if (CONST_INT_P (operands[6 + j])
5429 && operands[6 + j] != const0_rtx
5430 && REG_P (operands[2 + j]))
5431 for (i = j; i < nparts - 1; i++)
5432 if (CONST_INT_P (operands[7 + i])
5433 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5434 operands[7 + i] = operands[2 + j];
5435 }
5436
5437 for (i = 0; i < nparts; i++)
5438 emit_move_insn (operands[2 + i], operands[6 + i]);
5439
5440 return;
5441 }
5442
5443 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5444 left shift by a constant, either using a single shift or
5445 a sequence of add instructions. */
5446
5447 static void
5448 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5449 {
5450 if (count == 1
5451 || (count * ix86_cost->add <= ix86_cost->shift_const
5452 && !optimize_insn_for_size_p ()))
5453 {
5454 while (count-- > 0)
5455 emit_insn (gen_add2_insn (operand, operand));
5456 }
5457 else
5458 {
5459 rtx (*insn)(rtx, rtx, rtx);
5460
5461 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5462 emit_insn (insn (operand, operand, GEN_INT (count)));
5463 }
5464 }
5465
5466 void
5467 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5468 {
5469 rtx (*gen_ashl3)(rtx, rtx, rtx);
5470 rtx (*gen_shld)(rtx, rtx, rtx);
5471 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5472 machine_mode half_mode;
5473
5474 rtx low[2], high[2];
5475 int count;
5476
5477 if (CONST_INT_P (operands[2]))
5478 {
5479 split_double_mode (mode, operands, 2, low, high);
5480 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5481
5482 if (count >= half_width)
5483 {
5484 emit_move_insn (high[0], low[1]);
5485 emit_move_insn (low[0], const0_rtx);
5486
5487 if (count > half_width)
5488 ix86_expand_ashl_const (high[0], count - half_width, mode);
5489 }
5490 else
5491 {
5492 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5493
5494 if (!rtx_equal_p (operands[0], operands[1]))
5495 emit_move_insn (operands[0], operands[1]);
5496
5497 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5498 ix86_expand_ashl_const (low[0], count, mode);
5499 }
5500 return;
5501 }
5502
5503 split_double_mode (mode, operands, 1, low, high);
5504 half_mode = mode == DImode ? SImode : DImode;
5505
5506 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5507
5508 if (operands[1] == const1_rtx)
5509 {
5510 /* Assuming we've chosen QImode-capable registers, 1 << N
5511 can be done with two 32/64-bit shifts, no branches, no cmoves. */
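/* In outline: set low = ((N & half_width) == 0) and
   high = ((N & half_width) != 0), then shift both halves left by N;
   the hardware masks the shift count to the half width, so exactly one
   half ends up holding 1 << (N mod half_width) and the other stays zero.  */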
5512 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5513 {
5514 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5515
5516 ix86_expand_clear (low[0]);
5517 ix86_expand_clear (high[0]);
5518 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5519
5520 d = gen_lowpart (QImode, low[0]);
5521 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5522 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5523 emit_insn (gen_rtx_SET (d, s));
5524
5525 d = gen_lowpart (QImode, high[0]);
5526 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5527 s = gen_rtx_NE (QImode, flags, const0_rtx);
5528 emit_insn (gen_rtx_SET (d, s));
5529 }
5530
5531 /* Otherwise, we can get the same results by manually performing
5532 a bit extract operation on bit 5/6, and then performing the two
5533 shifts. The two methods of getting 0/1 into low/high are exactly
5534 the same size. Avoiding the shift in the bit extract case helps
5535 pentium4 a bit; no one else seems to care much either way. */
5536 else
5537 {
5538 rtx (*gen_lshr3)(rtx, rtx, rtx);
5539 rtx (*gen_and3)(rtx, rtx, rtx);
5540 rtx (*gen_xor3)(rtx, rtx, rtx);
5541 HOST_WIDE_INT bits;
5542 rtx x;
5543
5544 if (mode == DImode)
5545 {
5546 gen_lshr3 = gen_lshrsi3;
5547 gen_and3 = gen_andsi3;
5548 gen_xor3 = gen_xorsi3;
5549 bits = 5;
5550 }
5551 else
5552 {
5553 gen_lshr3 = gen_lshrdi3;
5554 gen_and3 = gen_anddi3;
5555 gen_xor3 = gen_xordi3;
5556 bits = 6;
5557 }
5558
5559 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5560 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5561 else
5562 x = gen_lowpart (half_mode, operands[2]);
5563 emit_insn (gen_rtx_SET (high[0], x));
5564
5565 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5566 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5567 emit_move_insn (low[0], high[0]);
5568 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5569 }
5570
5571 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5572 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5573 return;
5574 }
5575
5576 if (operands[1] == constm1_rtx)
5577 {
5578 /* For -1 << N, we can avoid the shld instruction, because we
5579 know that we're shifting 0...31/63 ones into a -1. */
5580 emit_move_insn (low[0], constm1_rtx);
5581 if (optimize_insn_for_size_p ())
5582 emit_move_insn (high[0], low[0]);
5583 else
5584 emit_move_insn (high[0], constm1_rtx);
5585 }
5586 else
5587 {
5588 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5589
5590 if (!rtx_equal_p (operands[0], operands[1]))
5591 emit_move_insn (operands[0], operands[1]);
5592
5593 split_double_mode (mode, operands, 1, low, high);
5594 emit_insn (gen_shld (high[0], low[0], operands[2]));
5595 }
5596
5597 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5598
5599 if (TARGET_CMOVE && scratch)
5600 {
5601 ix86_expand_clear (scratch);
5602 emit_insn (gen_x86_shift_adj_1
5603 (half_mode, high[0], low[0], operands[2], scratch));
5604 }
5605 else
5606 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5607 }
5608
5609 void
5610 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5611 {
5612 rtx (*gen_ashr3)(rtx, rtx, rtx)
5613 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5614 rtx (*gen_shrd)(rtx, rtx, rtx);
5615 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5616
5617 rtx low[2], high[2];
5618 int count;
5619
5620 if (CONST_INT_P (operands[2]))
5621 {
5622 split_double_mode (mode, operands, 2, low, high);
5623 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5624
5625 if (count == GET_MODE_BITSIZE (mode) - 1)
5626 {
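/* An arithmetic right shift by the full width minus one simply
   broadcasts the sign bit, so both halves become copies of
   high >> (half_width - 1).  */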
5627 emit_move_insn (high[0], high[1]);
5628 emit_insn (gen_ashr3 (high[0], high[0],
5629 GEN_INT (half_width - 1)));
5630 emit_move_insn (low[0], high[0]);
5631
5632 }
5633 else if (count >= half_width)
5634 {
5635 emit_move_insn (low[0], high[1]);
5636 emit_move_insn (high[0], low[0]);
5637 emit_insn (gen_ashr3 (high[0], high[0],
5638 GEN_INT (half_width - 1)));
5639
5640 if (count > half_width)
5641 emit_insn (gen_ashr3 (low[0], low[0],
5642 GEN_INT (count - half_width)));
5643 }
5644 else
5645 {
5646 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5647
5648 if (!rtx_equal_p (operands[0], operands[1]))
5649 emit_move_insn (operands[0], operands[1]);
5650
5651 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5652 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5653 }
5654 }
5655 else
5656 {
5657 machine_mode half_mode;
5658
5659 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5660
5661 if (!rtx_equal_p (operands[0], operands[1]))
5662 emit_move_insn (operands[0], operands[1]);
5663
5664 split_double_mode (mode, operands, 1, low, high);
5665 half_mode = mode == DImode ? SImode : DImode;
5666
5667 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5668 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5669
5670 if (TARGET_CMOVE && scratch)
5671 {
5672 emit_move_insn (scratch, high[0]);
5673 emit_insn (gen_ashr3 (scratch, scratch,
5674 GEN_INT (half_width - 1)));
5675 emit_insn (gen_x86_shift_adj_1
5676 (half_mode, low[0], high[0], operands[2], scratch));
5677 }
5678 else
5679 emit_insn (gen_x86_shift_adj_3
5680 (half_mode, low[0], high[0], operands[2]));
5681 }
5682 }
5683
5684 void
5685 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5686 {
5687 rtx (*gen_lshr3)(rtx, rtx, rtx)
5688 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5689 rtx (*gen_shrd)(rtx, rtx, rtx);
5690 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5691
5692 rtx low[2], high[2];
5693 int count;
5694
5695 if (CONST_INT_P (operands[2]))
5696 {
5697 split_double_mode (mode, operands, 2, low, high);
5698 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5699
5700 if (count >= half_width)
5701 {
5702 emit_move_insn (low[0], high[1]);
5703 ix86_expand_clear (high[0]);
5704
5705 if (count > half_width)
5706 emit_insn (gen_lshr3 (low[0], low[0],
5707 GEN_INT (count - half_width)));
5708 }
5709 else
5710 {
5711 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5712
5713 if (!rtx_equal_p (operands[0], operands[1]))
5714 emit_move_insn (operands[0], operands[1]);
5715
5716 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5717 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5718 }
5719 }
5720 else
5721 {
5722 machine_mode half_mode;
5723
5724 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5725
5726 if (!rtx_equal_p (operands[0], operands[1]))
5727 emit_move_insn (operands[0], operands[1]);
5728
5729 split_double_mode (mode, operands, 1, low, high);
5730 half_mode = mode == DImode ? SImode : DImode;
5731
5732 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5733 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5734
5735 if (TARGET_CMOVE && scratch)
5736 {
5737 ix86_expand_clear (scratch);
5738 emit_insn (gen_x86_shift_adj_1
5739 (half_mode, low[0], high[0], operands[2], scratch));
5740 }
5741 else
5742 emit_insn (gen_x86_shift_adj_2
5743 (half_mode, low[0], high[0], operands[2]));
5744 }
5745 }
5746
5747 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5748 DImode for constant loop counts. */
5749
5750 static machine_mode
5751 counter_mode (rtx count_exp)
5752 {
5753 if (GET_MODE (count_exp) != VOIDmode)
5754 return GET_MODE (count_exp);
5755 if (!CONST_INT_P (count_exp))
5756 return Pmode;
5757 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5758 return DImode;
5759 return SImode;
5760 }
5761
5762 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed
5763 to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the
5764 overall size is COUNT, specified in bytes. When ISSETMEM is TRUE, output
5765 the equivalent loop to set the memory with VALUE (expected to be in MODE).
5766
5767 The size is rounded down to a whole number of chunks moved at once.
5768 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
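/* A rough sketch of the emitted sequence (branch-probability notes and
   the piece_size == 1 early exit are omitted):

       size = count & -(GET_MODE_SIZE (mode) * unroll);
       iter = 0;
     top:
       <UNROLL chunk moves (or stores of VALUE) at offset iter>
       iter += GET_MODE_SIZE (mode) * unroll;
       if (iter < size) goto top;
       destptr += iter;  if (!issetmem) srcptr += iter;
     out:  */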
5769
5770
5771 static void
5772 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5773 rtx destptr, rtx srcptr, rtx value,
5774 rtx count, machine_mode mode, int unroll,
5775 int expected_size, bool issetmem)
5776 {
5777 rtx_code_label *out_label, *top_label;
5778 rtx iter, tmp;
5779 machine_mode iter_mode = counter_mode (count);
5780 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5781 rtx piece_size = GEN_INT (piece_size_n);
5782 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5783 rtx size;
5784 int i;
5785
5786 top_label = gen_label_rtx ();
5787 out_label = gen_label_rtx ();
5788 iter = gen_reg_rtx (iter_mode);
5789
5790 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5791 NULL, 1, OPTAB_DIRECT);
5792 /* Those two should combine. */
5793 if (piece_size == const1_rtx)
5794 {
5795 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5796 true, out_label);
5797 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5798 }
5799 emit_move_insn (iter, const0_rtx);
5800
5801 emit_label (top_label);
5802
5803 tmp = convert_modes (Pmode, iter_mode, iter, true);
5804
5805 /* This assert could be relaxed - in that case we would need to compute
5806 the smallest power of two containing PIECE_SIZE_N and pass it to
5807 offset_address. */
5808 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5809 destmem = offset_address (destmem, tmp, piece_size_n);
5810 destmem = adjust_address (destmem, mode, 0);
5811
5812 if (!issetmem)
5813 {
5814 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5815 srcmem = adjust_address (srcmem, mode, 0);
5816
5817 /* When unrolling for chips that reorder memory reads and writes,
5818 we can save registers by using a single temporary.
5819 Also, using 4 temporaries is overkill in 32-bit mode. */
5820 if (!TARGET_64BIT && 0)
5821 {
5822 for (i = 0; i < unroll; i++)
5823 {
5824 if (i)
5825 {
5826 destmem = adjust_address (copy_rtx (destmem), mode,
5827 GET_MODE_SIZE (mode));
5828 srcmem = adjust_address (copy_rtx (srcmem), mode,
5829 GET_MODE_SIZE (mode));
5830 }
5831 emit_move_insn (destmem, srcmem);
5832 }
5833 }
5834 else
5835 {
5836 rtx tmpreg[4];
5837 gcc_assert (unroll <= 4);
5838 for (i = 0; i < unroll; i++)
5839 {
5840 tmpreg[i] = gen_reg_rtx (mode);
5841 if (i)
5842 srcmem = adjust_address (copy_rtx (srcmem), mode,
5843 GET_MODE_SIZE (mode));
5844 emit_move_insn (tmpreg[i], srcmem);
5845 }
5846 for (i = 0; i < unroll; i++)
5847 {
5848 if (i)
5849 destmem = adjust_address (copy_rtx (destmem), mode,
5850 GET_MODE_SIZE (mode));
5851 emit_move_insn (destmem, tmpreg[i]);
5852 }
5853 }
5854 }
5855 else
5856 for (i = 0; i < unroll; i++)
5857 {
5858 if (i)
5859 destmem = adjust_address (copy_rtx (destmem), mode,
5860 GET_MODE_SIZE (mode));
5861 emit_move_insn (destmem, value);
5862 }
5863
5864 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5865 true, OPTAB_LIB_WIDEN);
5866 if (tmp != iter)
5867 emit_move_insn (iter, tmp);
5868
5869 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5870 true, top_label);
5871 if (expected_size != -1)
5872 {
5873 expected_size /= GET_MODE_SIZE (mode) * unroll;
5874 if (expected_size == 0)
5875 predict_jump (0);
5876 else if (expected_size > REG_BR_PROB_BASE)
5877 predict_jump (REG_BR_PROB_BASE - 1);
5878 else
5879 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5880 / expected_size);
5881 }
5882 else
5883 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5884 iter = ix86_zero_extend_to_Pmode (iter);
5885 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5886 true, OPTAB_LIB_WIDEN);
5887 if (tmp != destptr)
5888 emit_move_insn (destptr, tmp);
5889 if (!issetmem)
5890 {
5891 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5892 true, OPTAB_LIB_WIDEN);
5893 if (tmp != srcptr)
5894 emit_move_insn (srcptr, tmp);
5895 }
5896 emit_label (out_label);
5897 }
5898
5899 /* Divide COUNTREG by SCALE. */
5900 static rtx
5901 scale_counter (rtx countreg, int scale)
5902 {
5903 rtx sc;
5904
5905 if (scale == 1)
5906 return countreg;
5907 if (CONST_INT_P (countreg))
5908 return GEN_INT (INTVAL (countreg) / scale);
5909 gcc_assert (REG_P (countreg));
5910
5911 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5912 GEN_INT (exact_log2 (scale)),
5913 NULL, 1, OPTAB_DIRECT);
5914 return sc;
5915 }
5916
5917 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5918 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5919 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5920 In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5921 ORIG_VALUE is the original value passed to memset to fill the memory with.
5922 Other arguments have the same meaning as for the previous function. */
5923
5924 static void
5925 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5926 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5927 rtx count,
5928 machine_mode mode, bool issetmem)
5929 {
5930 rtx destexp;
5931 rtx srcexp;
5932 rtx countreg;
5933 HOST_WIDE_INT rounded_count;
5934
5935 /* If possible, it is shorter to use rep movs.
5936 TODO: Maybe it is better to move this logic to decide_alg. */
5937 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5938 && (!issetmem || orig_value == const0_rtx))
5939 mode = SImode;
5940
5941 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5942 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5943
5944 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5945 GET_MODE_SIZE (mode)));
5946 if (mode != QImode)
5947 {
5948 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5949 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5950 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5951 }
5952 else
5953 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5954 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5955 {
5956 rounded_count
5957 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5958 destmem = shallow_copy_rtx (destmem);
5959 set_mem_size (destmem, rounded_count);
5960 }
5961 else if (MEM_SIZE_KNOWN_P (destmem))
5962 clear_mem_size (destmem);
5963
5964 if (issetmem)
5965 {
5966 value = force_reg (mode, gen_lowpart (mode, value));
5967 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5968 }
5969 else
5970 {
5971 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5972 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5973 if (mode != QImode)
5974 {
5975 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5976 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5977 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5978 }
5979 else
5980 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
5981 if (CONST_INT_P (count))
5982 {
5983 rounded_count
5984 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5985 srcmem = shallow_copy_rtx (srcmem);
5986 set_mem_size (srcmem, rounded_count);
5987 }
5988 else
5989 {
5990 if (MEM_SIZE_KNOWN_P (srcmem))
5991 clear_mem_size (srcmem);
5992 }
5993 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
5994 destexp, srcexp));
5995 }
5996 }
5997
5998 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
5999 DESTMEM.
6000 SRCMEM is passed by pointer so that it can be updated on return.
6001 The return value is the updated DESTMEM.  */
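/* A minimal sketch of what this emits for SIZE_TO_MOVE == 8 on a 64-bit
   target: one DImode load into a fresh temporary, one DImode store, and both
   DESTPTR and SRCPTR advanced by 8.  The callers below pass power-of-two
   sizes, so the split into PIECE_SIZE chunks is always exact.  */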
6002 static rtx
6003 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6004 HOST_WIDE_INT size_to_move)
6005 {
6006 rtx dst = destmem, src = *srcmem, tempreg;
6007 enum insn_code code;
6008 machine_mode move_mode;
6009 int piece_size, i;
6010
6011 /* Find the widest mode in which we could perform moves.
6012 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
6013 it until a move of that size is supported.  */
6014 piece_size = 1 << floor_log2 (size_to_move);
6015 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6016 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6017 {
6018 gcc_assert (piece_size > 1);
6019 piece_size >>= 1;
6020 }
6021
6022 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6023 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6024 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6025 {
6026 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6027 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6028 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6029 {
6030 move_mode = word_mode;
6031 piece_size = GET_MODE_SIZE (move_mode);
6032 code = optab_handler (mov_optab, move_mode);
6033 }
6034 }
6035 gcc_assert (code != CODE_FOR_nothing);
6036
6037 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6038 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6039
6040 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
6041 gcc_assert (size_to_move % piece_size == 0);
6042
6043 for (i = 0; i < size_to_move; i += piece_size)
6044 {
6045 /* We move from memory to memory, so we'll need to do it via
6046 a temporary register. */
6047 tempreg = gen_reg_rtx (move_mode);
6048 emit_insn (GEN_FCN (code) (tempreg, src));
6049 emit_insn (GEN_FCN (code) (dst, tempreg));
6050
6051 emit_move_insn (destptr,
6052 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6053 emit_move_insn (srcptr,
6054 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6055
6056 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6057 piece_size);
6058 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6059 piece_size);
6060 }
6061
6062 /* Update DST and SRC rtx. */
6063 *srcmem = src;
6064 return dst;
6065 }
6066
6067 /* Helper function for the string operations below.  Emit code testing
6068 whether the VALUE bit of VARIABLE is clear; if so, jump to the returned label.  */
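/* In other words, a call like ix86_expand_aligntest (count, 4, true) emits
   roughly "if ((count & 4) == 0) goto label;" and returns that label, so the
   caller can place a 4-byte move before emitting the label.  */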
6069
6070 static rtx_code_label *
6071 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6072 {
6073 rtx_code_label *label = gen_label_rtx ();
6074 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6075 if (GET_MODE (variable) == DImode)
6076 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6077 else
6078 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6079 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6080 1, label);
6081 if (epilogue)
6082 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6083 else
6084 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6085 return label;
6086 }
6087
6088
6089 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
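/* As a hypothetical example: with a constant COUNT of 29 and MAX_SIZE of 16
   the epilogue has to copy 29 % 16 == 13 bytes, which is emitted as an
   8-byte, a 4-byte and a 1-byte move, one move per set bit of the residue.  */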
6090
6091 static void
6092 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6093 rtx destptr, rtx srcptr, rtx count, int max_size)
6094 {
6095 rtx src, dest;
6096 if (CONST_INT_P (count))
6097 {
6098 HOST_WIDE_INT countval = INTVAL (count);
6099 HOST_WIDE_INT epilogue_size = countval % max_size;
6100 int i;
6101
6102 /* For now MAX_SIZE should be a power of 2. This assert could be
6103 relaxed, but it'll require a bit more complicated epilogue
6104 expansion.  */
6105 gcc_assert ((max_size & (max_size - 1)) == 0);
6106 for (i = max_size; i >= 1; i >>= 1)
6107 {
6108 if (epilogue_size & i)
6109 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6110 }
6111 return;
6112 }
6113 if (max_size > 8)
6114 {
6115 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6116 count, 1, OPTAB_DIRECT);
6117 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6118 count, QImode, 1, 4, false);
6119 return;
6120 }
6121
6122 /* When single string operations (TARGET_SINGLE_STRINGOP) are available, we
6123 can cheaply advance the dest and src pointers.  Otherwise we save code
6124 size by maintaining an offset (zero is readily available from the
6125 preceding rep operation) and using x86 addressing modes.  */
6126 if (TARGET_SINGLE_STRINGOP)
6127 {
6128 if (max_size > 4)
6129 {
6130 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6131 src = change_address (srcmem, SImode, srcptr);
6132 dest = change_address (destmem, SImode, destptr);
6133 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6134 emit_label (label);
6135 LABEL_NUSES (label) = 1;
6136 }
6137 if (max_size > 2)
6138 {
6139 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6140 src = change_address (srcmem, HImode, srcptr);
6141 dest = change_address (destmem, HImode, destptr);
6142 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6143 emit_label (label);
6144 LABEL_NUSES (label) = 1;
6145 }
6146 if (max_size > 1)
6147 {
6148 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6149 src = change_address (srcmem, QImode, srcptr);
6150 dest = change_address (destmem, QImode, destptr);
6151 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6152 emit_label (label);
6153 LABEL_NUSES (label) = 1;
6154 }
6155 }
6156 else
6157 {
6158 rtx offset = force_reg (Pmode, const0_rtx);
6159 rtx tmp;
6160
6161 if (max_size > 4)
6162 {
6163 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6164 src = change_address (srcmem, SImode, srcptr);
6165 dest = change_address (destmem, SImode, destptr);
6166 emit_move_insn (dest, src);
6167 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6168 true, OPTAB_LIB_WIDEN);
6169 if (tmp != offset)
6170 emit_move_insn (offset, tmp);
6171 emit_label (label);
6172 LABEL_NUSES (label) = 1;
6173 }
6174 if (max_size > 2)
6175 {
6176 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6177 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6178 src = change_address (srcmem, HImode, tmp);
6179 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6180 dest = change_address (destmem, HImode, tmp);
6181 emit_move_insn (dest, src);
6182 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6183 true, OPTAB_LIB_WIDEN);
6184 if (tmp != offset)
6185 emit_move_insn (offset, tmp);
6186 emit_label (label);
6187 LABEL_NUSES (label) = 1;
6188 }
6189 if (max_size > 1)
6190 {
6191 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6192 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6193 src = change_address (srcmem, QImode, tmp);
6194 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6195 dest = change_address (destmem, QImode, tmp);
6196 emit_move_insn (dest, src);
6197 emit_label (label);
6198 LABEL_NUSES (label) = 1;
6199 }
6200 }
6201 }
6202
6203 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6204 with value PROMOTED_VAL.
6205 Unlike emit_memmov there is no source memory to update.
6206 The return value is the updated DESTMEM.  */
6207 static rtx
6208 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6209 HOST_WIDE_INT size_to_move)
6210 {
6211 rtx dst = destmem;
6212 enum insn_code code;
6213 machine_mode move_mode;
6214 int piece_size, i;
6215
6216 /* Find the mode in which to perform the stores, normally the mode of
6217 PROMOTED_VAL; if SIZE_TO_MOVE is smaller than that, narrow both the
6218 mode and PROMOTED_VAL accordingly.  */
6219 move_mode = GET_MODE (promoted_val);
6220 if (move_mode == VOIDmode)
6221 move_mode = QImode;
6222 if (size_to_move < GET_MODE_SIZE (move_mode))
6223 {
6224 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6225 move_mode = int_mode_for_size (move_bits, 0).require ();
6226 promoted_val = gen_lowpart (move_mode, promoted_val);
6227 }
6228 piece_size = GET_MODE_SIZE (move_mode);
6229 code = optab_handler (mov_optab, move_mode);
6230 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6231
6232 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6233
6234 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
6235 gcc_assert (size_to_move % piece_size == 0);
6236
6237 for (i = 0; i < size_to_move; i += piece_size)
6238 {
6239 if (piece_size <= GET_MODE_SIZE (word_mode))
6240 {
6241 emit_insn (gen_strset (destptr, dst, promoted_val));
6242 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6243 piece_size);
6244 continue;
6245 }
6246
6247 emit_insn (GEN_FCN (code) (dst, promoted_val));
6248
6249 emit_move_insn (destptr,
6250 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6251
6252 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6253 piece_size);
6254 }
6255
6256 /* Update DST rtx. */
6257 return dst;
6258 }
6259 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6260 static void
6261 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6262 rtx count, int max_size)
6263 {
6264 count = expand_simple_binop (counter_mode (count), AND, count,
6265 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6266 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6267 gen_lowpart (QImode, value), count, QImode,
6268 1, max_size / 2, true);
6269 }
6270
6271 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
6272 static void
6273 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6274 rtx count, int max_size)
6275 {
6276 rtx dest;
6277
6278 if (CONST_INT_P (count))
6279 {
6280 HOST_WIDE_INT countval = INTVAL (count);
6281 HOST_WIDE_INT epilogue_size = countval % max_size;
6282 int i;
6283
6284 /* For now MAX_SIZE should be a power of 2. This assert could be
6285 relaxed, but it'll require a bit more complicated epilogue
6286 expansion.  */
6287 gcc_assert ((max_size & (max_size - 1)) == 0);
6288 for (i = max_size; i >= 1; i >>= 1)
6289 {
6290 if (epilogue_size & i)
6291 {
6292 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6293 destmem = emit_memset (destmem, destptr, vec_value, i);
6294 else
6295 destmem = emit_memset (destmem, destptr, value, i);
6296 }
6297 }
6298 return;
6299 }
6300 if (max_size > 32)
6301 {
6302 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6303 return;
6304 }
6305 if (max_size > 16)
6306 {
6307 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6308 if (TARGET_64BIT)
6309 {
6310 dest = change_address (destmem, DImode, destptr);
6311 emit_insn (gen_strset (destptr, dest, value));
6312 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6313 emit_insn (gen_strset (destptr, dest, value));
6314 }
6315 else
6316 {
6317 dest = change_address (destmem, SImode, destptr);
6318 emit_insn (gen_strset (destptr, dest, value));
6319 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6320 emit_insn (gen_strset (destptr, dest, value));
6321 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6322 emit_insn (gen_strset (destptr, dest, value));
6323 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6324 emit_insn (gen_strset (destptr, dest, value));
6325 }
6326 emit_label (label);
6327 LABEL_NUSES (label) = 1;
6328 }
6329 if (max_size > 8)
6330 {
6331 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6332 if (TARGET_64BIT)
6333 {
6334 dest = change_address (destmem, DImode, destptr);
6335 emit_insn (gen_strset (destptr, dest, value));
6336 }
6337 else
6338 {
6339 dest = change_address (destmem, SImode, destptr);
6340 emit_insn (gen_strset (destptr, dest, value));
6341 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6342 emit_insn (gen_strset (destptr, dest, value));
6343 }
6344 emit_label (label);
6345 LABEL_NUSES (label) = 1;
6346 }
6347 if (max_size > 4)
6348 {
6349 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6350 dest = change_address (destmem, SImode, destptr);
6351 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6352 emit_label (label);
6353 LABEL_NUSES (label) = 1;
6354 }
6355 if (max_size > 2)
6356 {
6357 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6358 dest = change_address (destmem, HImode, destptr);
6359 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6360 emit_label (label);
6361 LABEL_NUSES (label) = 1;
6362 }
6363 if (max_size > 1)
6364 {
6365 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6366 dest = change_address (destmem, QImode, destptr);
6367 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6368 emit_label (label);
6369 LABEL_NUSES (label) = 1;
6370 }
6371 }
6372
6373 /* Decrease COUNTREG by VALUE.  */
6374 static void
6375 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6376 {
6377 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6378 }
6379
6380 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6381 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6382 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6383 ignored.
6384 Return value is updated DESTMEM. */
6385
6386 static rtx
6387 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6388 rtx destptr, rtx srcptr, rtx value,
6389 rtx vec_value, rtx count, int align,
6390 int desired_alignment, bool issetmem)
6391 {
6392 int i;
6393 for (i = 1; i < desired_alignment; i <<= 1)
6394 {
6395 if (align <= i)
6396 {
6397 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6398 if (issetmem)
6399 {
6400 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6401 destmem = emit_memset (destmem, destptr, vec_value, i);
6402 else
6403 destmem = emit_memset (destmem, destptr, value, i);
6404 }
6405 else
6406 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6407 ix86_adjust_counter (count, i);
6408 emit_label (label);
6409 LABEL_NUSES (label) = 1;
6410 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6411 }
6412 }
6413 return destmem;
6414 }
6415
6416 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
6417 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6418 and jump to DONE_LABEL. */
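/* For instance, with SIZE == 4 (and the COUNT & 4 bit set) the emitted code
   copies bytes [0, 4) and bytes [COUNT - 4, COUNT) of the block; the two
   possibly overlapping moves cover every length from 4 to 7.  */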
6419 static void
6420 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6421 rtx destptr, rtx srcptr,
6422 rtx value, rtx vec_value,
6423 rtx count, int size,
6424 rtx done_label, bool issetmem)
6425 {
6426 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6427 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6428 rtx modesize;
6429 int n;
6430
6431 /* If we do not have a vector value to copy, we must reduce the size.  */
6432 if (issetmem)
6433 {
6434 if (!vec_value)
6435 {
6436 if (GET_MODE (value) == VOIDmode && size > 8)
6437 mode = Pmode;
6438 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6439 mode = GET_MODE (value);
6440 }
6441 else
6442 mode = GET_MODE (vec_value), value = vec_value;
6443 }
6444 else
6445 {
6446 /* Choose appropriate vector mode. */
6447 if (size >= 32)
6448 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6449 else if (size >= 16)
6450 mode = TARGET_SSE ? V16QImode : DImode;
6451 srcmem = change_address (srcmem, mode, srcptr);
6452 }
6453 destmem = change_address (destmem, mode, destptr);
6454 modesize = GEN_INT (GET_MODE_SIZE (mode));
6455 gcc_assert (GET_MODE_SIZE (mode) <= size);
6456 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6457 {
6458 if (issetmem)
6459 emit_move_insn (destmem, gen_lowpart (mode, value));
6460 else
6461 {
6462 emit_move_insn (destmem, srcmem);
6463 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6464 }
6465 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6466 }
6467
6468 destmem = offset_address (destmem, count, 1);
6469 destmem = offset_address (destmem, GEN_INT (-2 * size),
6470 GET_MODE_SIZE (mode));
6471 if (!issetmem)
6472 {
6473 srcmem = offset_address (srcmem, count, 1);
6474 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6475 GET_MODE_SIZE (mode));
6476 }
6477 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6478 {
6479 if (issetmem)
6480 emit_move_insn (destmem, gen_lowpart (mode, value));
6481 else
6482 {
6483 emit_move_insn (destmem, srcmem);
6484 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6485 }
6486 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6487 }
6488 emit_jump_insn (gen_jump (done_label));
6489 emit_barrier ();
6490
6491 emit_label (label);
6492 LABEL_NUSES (label) = 1;
6493 }
6494
6495 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
6496 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6497 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
6498 proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
6499 DONE_LABEL is a label after the whole copying sequence. The label is created
6500 on demand if *DONE_LABEL is NULL.
6501 MIN_SIZE is the minimal size of the copied block.  This value gets adjusted
6502 for the new bounds after the initial copies.
6503
6504 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6505 DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
6506 we will dispatch to a library call for large blocks.
6507
6508 In pseudocode we do:
6509
6510 if (COUNT < SIZE)
6511 {
6512 Assume that SIZE is 4. Bigger sizes are handled analogously
6513 if (COUNT & 4)
6514 {
6515 copy 4 bytes from SRCPTR to DESTPTR
6516 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6517 goto done_label
6518 }
6519 if (!COUNT)
6520 goto done_label;
6521 copy 1 byte from SRCPTR to DESTPTR
6522 if (COUNT & 2)
6523 {
6524 copy 2 bytes from SRCPTR to DESTPTR
6525 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6526 }
6527 }
6528 else
6529 {
6530 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6531 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
6532
6533 OLD_DESTPTR = DESTPTR;
6534 Align DESTPTR up to DESIRED_ALIGN
6535 SRCPTR += DESTPTR - OLD_DESTPTR
6536 COUNT -= DESTPTR - OLD_DESTPTR
6537 if (DYNAMIC_CHECK)
6538 Round COUNT down to multiple of SIZE
6539 << optional caller supplied zero size guard is here >>
6540 << optional caller supplied dynamic check is here >>
6541 << caller supplied main copy loop is here >>
6542 }
6543 done_label:
6544 */
6545 static void
6546 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6547 rtx *destptr, rtx *srcptr,
6548 machine_mode mode,
6549 rtx value, rtx vec_value,
6550 rtx *count,
6551 rtx_code_label **done_label,
6552 int size,
6553 int desired_align,
6554 int align,
6555 unsigned HOST_WIDE_INT *min_size,
6556 bool dynamic_check,
6557 bool issetmem)
6558 {
6559 rtx_code_label *loop_label = NULL, *label;
6560 int n;
6561 rtx modesize;
6562 int prolog_size = 0;
6563 rtx mode_value;
6564
6565 /* Choose the proper value to copy.  */
6566 if (issetmem && VECTOR_MODE_P (mode))
6567 mode_value = vec_value;
6568 else
6569 mode_value = value;
6570 gcc_assert (GET_MODE_SIZE (mode) <= size);
6571
6572 /* See if block is big or small, handle small blocks. */
6573 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6574 {
6575 int size2 = size;
6576 loop_label = gen_label_rtx ();
6577
6578 if (!*done_label)
6579 *done_label = gen_label_rtx ();
6580
6581 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6582 1, loop_label);
6583 size2 >>= 1;
6584
6585 /* Handle sizes > 3. */
6586 for (;size2 > 2; size2 >>= 1)
6587 expand_small_cpymem_or_setmem (destmem, srcmem,
6588 *destptr, *srcptr,
6589 value, vec_value,
6590 *count,
6591 size2, *done_label, issetmem);
6592 /* Nothing to copy? Jump to DONE_LABEL if so */
6593 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6594 1, *done_label);
6595
6596 /* Do a byte copy. */
6597 destmem = change_address (destmem, QImode, *destptr);
6598 if (issetmem)
6599 emit_move_insn (destmem, gen_lowpart (QImode, value));
6600 else
6601 {
6602 srcmem = change_address (srcmem, QImode, *srcptr);
6603 emit_move_insn (destmem, srcmem);
6604 }
6605
6606 /* Handle sizes 2 and 3. */
6607 label = ix86_expand_aligntest (*count, 2, false);
6608 destmem = change_address (destmem, HImode, *destptr);
6609 destmem = offset_address (destmem, *count, 1);
6610 destmem = offset_address (destmem, GEN_INT (-2), 2);
6611 if (issetmem)
6612 emit_move_insn (destmem, gen_lowpart (HImode, value));
6613 else
6614 {
6615 srcmem = change_address (srcmem, HImode, *srcptr);
6616 srcmem = offset_address (srcmem, *count, 1);
6617 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6618 emit_move_insn (destmem, srcmem);
6619 }
6620
6621 emit_label (label);
6622 LABEL_NUSES (label) = 1;
6623 emit_jump_insn (gen_jump (*done_label));
6624 emit_barrier ();
6625 }
6626 else
6627 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6628 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6629
6630 /* Start memcpy for COUNT >= SIZE. */
6631 if (loop_label)
6632 {
6633 emit_label (loop_label);
6634 LABEL_NUSES (loop_label) = 1;
6635 }
6636
6637 /* Copy first desired_align bytes. */
6638 if (!issetmem)
6639 srcmem = change_address (srcmem, mode, *srcptr);
6640 destmem = change_address (destmem, mode, *destptr);
6641 modesize = GEN_INT (GET_MODE_SIZE (mode));
6642 for (n = 0; prolog_size < desired_align - align; n++)
6643 {
6644 if (issetmem)
6645 emit_move_insn (destmem, mode_value);
6646 else
6647 {
6648 emit_move_insn (destmem, srcmem);
6649 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6650 }
6651 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6652 prolog_size += GET_MODE_SIZE (mode);
6653 }
6654
6655
6656 /* Copy last SIZE bytes. */
6657 destmem = offset_address (destmem, *count, 1);
6658 destmem = offset_address (destmem,
6659 GEN_INT (-size - prolog_size),
6660 1);
6661 if (issetmem)
6662 emit_move_insn (destmem, mode_value);
6663 else
6664 {
6665 srcmem = offset_address (srcmem, *count, 1);
6666 srcmem = offset_address (srcmem,
6667 GEN_INT (-size - prolog_size),
6668 1);
6669 emit_move_insn (destmem, srcmem);
6670 }
6671 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6672 {
6673 destmem = offset_address (destmem, modesize, 1);
6674 if (issetmem)
6675 emit_move_insn (destmem, mode_value);
6676 else
6677 {
6678 srcmem = offset_address (srcmem, modesize, 1);
6679 emit_move_insn (destmem, srcmem);
6680 }
6681 }
6682
6683 /* Align destination. */
6684 if (desired_align > 1 && desired_align > align)
6685 {
6686 rtx saveddest = *destptr;
6687
6688 gcc_assert (desired_align <= size);
6689 /* Align destptr up, place it to new register. */
6690 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6691 GEN_INT (prolog_size),
6692 NULL_RTX, 1, OPTAB_DIRECT);
6693 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6694 REG_POINTER (*destptr) = 1;
6695 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6696 GEN_INT (-desired_align),
6697 *destptr, 1, OPTAB_DIRECT);
6698 /* See how many bytes we skipped. */
6699 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6700 *destptr,
6701 saveddest, 1, OPTAB_DIRECT);
6702 /* Adjust srcptr and count. */
6703 if (!issetmem)
6704 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6705 saveddest, *srcptr, 1, OPTAB_DIRECT);
6706 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6707 saveddest, *count, 1, OPTAB_DIRECT);
6708 /* We copied at most size + prolog_size. */
6709 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6710 *min_size
6711 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6712 else
6713 *min_size = 0;
6714
6715 /* Our loops always round down the block size, but for dispatch to
6716 library we need precise value. */
6717 if (dynamic_check)
6718 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6719 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6720 }
6721 else
6722 {
6723 gcc_assert (prolog_size == 0);
6724 /* Decrease count, so we won't end up copying last word twice. */
6725 if (!CONST_INT_P (*count))
6726 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6727 constm1_rtx, *count, 1, OPTAB_DIRECT);
6728 else
6729 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6730 (unsigned HOST_WIDE_INT)size));
6731 if (*min_size)
6732 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6733 }
6734 }
6735
6736
6737 /* This function is like the previous one, except here we know how many bytes
6738 need to be copied. That allows us to update alignment not only of DST, which
6739 is returned, but also of SRC, which is passed as a pointer for that
6740 reason. */
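/* A hypothetical example: with DESIRED_ALIGN == 8 and ALIGN_BYTES == 7 this
   emits a 1-byte, a 2-byte and a 4-byte copy (or store), one for each set bit
   of ALIGN_BYTES, and then records the improved alignment of DST and, for
   copies, of SRC.  */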
6741 static rtx
6742 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6743 rtx srcreg, rtx value, rtx vec_value,
6744 int desired_align, int align_bytes,
6745 bool issetmem)
6746 {
6747 rtx src = NULL;
6748 rtx orig_dst = dst;
6749 rtx orig_src = NULL;
6750 int piece_size = 1;
6751 int copied_bytes = 0;
6752
6753 if (!issetmem)
6754 {
6755 gcc_assert (srcp != NULL);
6756 src = *srcp;
6757 orig_src = src;
6758 }
6759
6760 for (piece_size = 1;
6761 piece_size <= desired_align && copied_bytes < align_bytes;
6762 piece_size <<= 1)
6763 {
6764 if (align_bytes & piece_size)
6765 {
6766 if (issetmem)
6767 {
6768 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6769 dst = emit_memset (dst, destreg, vec_value, piece_size);
6770 else
6771 dst = emit_memset (dst, destreg, value, piece_size);
6772 }
6773 else
6774 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6775 copied_bytes += piece_size;
6776 }
6777 }
6778 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6779 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6780 if (MEM_SIZE_KNOWN_P (orig_dst))
6781 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6782
6783 if (!issetmem)
6784 {
6785 int src_align_bytes = get_mem_align_offset (src, desired_align
6786 * BITS_PER_UNIT);
6787 if (src_align_bytes >= 0)
6788 src_align_bytes = desired_align - src_align_bytes;
6789 if (src_align_bytes >= 0)
6790 {
6791 unsigned int src_align;
6792 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6793 {
6794 if ((src_align_bytes & (src_align - 1))
6795 == (align_bytes & (src_align - 1)))
6796 break;
6797 }
6798 if (src_align > (unsigned int) desired_align)
6799 src_align = desired_align;
6800 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6801 set_mem_align (src, src_align * BITS_PER_UNIT);
6802 }
6803 if (MEM_SIZE_KNOWN_P (orig_src))
6804 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6805 *srcp = src;
6806 }
6807
6808 return dst;
6809 }
6810
6811 /* Return true if ALG can be used in the current context.
6812 Assume we expand memset if MEMSET is true. */
6813 static bool
6814 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6815 {
6816 if (alg == no_stringop)
6817 return false;
6818 if (alg == vector_loop)
6819 return TARGET_SSE || TARGET_AVX;
6820 /* Algorithms using the rep prefix want at least edi and ecx;
6821 additionally, memset wants eax and memcpy wants esi. Don't
6822 consider such algorithms if the user has appropriated those
6823 registers for their own purposes, or if we have a non-default
6824 address space, since some string insns cannot override the segment. */
6825 if (alg == rep_prefix_1_byte
6826 || alg == rep_prefix_4_byte
6827 || alg == rep_prefix_8_byte)
6828 {
6829 if (have_as)
6830 return false;
6831 if (fixed_regs[CX_REG]
6832 || fixed_regs[DI_REG]
6833 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6834 return false;
6835 }
6836 return true;
6837 }
6838
6839 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6840 static enum stringop_alg
6841 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6842 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6843 bool memset, bool zero_memset, bool have_as,
6844 int *dynamic_check, bool *noalign, bool recur)
6845 {
6846 const struct stringop_algs *algs;
6847 bool optimize_for_speed;
6848 int max = 0;
6849 const struct processor_costs *cost;
6850 int i;
6851 bool any_alg_usable_p = false;
6852
6853 *noalign = false;
6854 *dynamic_check = -1;
6855
6856 /* Even if the string operation call is cold, we still might spend a lot
6857 of time processing large blocks. */
6858 if (optimize_function_for_size_p (cfun)
6859 || (optimize_insn_for_size_p ()
6860 && (max_size < 256
6861 || (expected_size != -1 && expected_size < 256))))
6862 optimize_for_speed = false;
6863 else
6864 optimize_for_speed = true;
6865
6866 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6867 if (memset)
6868 algs = &cost->memset[TARGET_64BIT != 0];
6869 else
6870 algs = &cost->memcpy[TARGET_64BIT != 0];
6871
6872 /* See maximal size for user defined algorithm. */
6873 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6874 {
6875 enum stringop_alg candidate = algs->size[i].alg;
6876 bool usable = alg_usable_p (candidate, memset, have_as);
6877 any_alg_usable_p |= usable;
6878
6879 if (candidate != libcall && candidate && usable)
6880 max = algs->size[i].max;
6881 }
6882
6883 /* If the expected size is not known but the max size is small enough
6884 that the inline version is a win, set the expected size to the middle
6885 of the range.  */
6886 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6887 && expected_size == -1)
6888 expected_size = min_size / 2 + max_size / 2;
6889
6890 /* If user specified the algorithm, honor it if possible. */
6891 if (ix86_stringop_alg != no_stringop
6892 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6893 return ix86_stringop_alg;
6894 /* rep; movq or rep; movl is the smallest variant. */
6895 else if (!optimize_for_speed)
6896 {
6897 *noalign = true;
6898 if (!count || (count & 3) || (memset && !zero_memset))
6899 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6900 ? rep_prefix_1_byte : loop_1_byte;
6901 else
6902 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6903 ? rep_prefix_4_byte : loop;
6904 }
6905 /* Very tiny blocks are best handled via the loop, REP is expensive to
6906 set up.  */
6907 else if (expected_size != -1 && expected_size < 4)
6908 return loop_1_byte;
6909 else if (expected_size != -1)
6910 {
6911 enum stringop_alg alg = libcall;
6912 bool alg_noalign = false;
6913 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6914 {
6915 /* We get here if the algorithms that were not libcall-based
6916 were rep-prefix based and we are unable to use rep prefixes
6917 based on global register usage. Break out of the loop and
6918 use the heuristic below. */
6919 if (algs->size[i].max == 0)
6920 break;
6921 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6922 {
6923 enum stringop_alg candidate = algs->size[i].alg;
6924
6925 if (candidate != libcall
6926 && alg_usable_p (candidate, memset, have_as))
6927 {
6928 alg = candidate;
6929 alg_noalign = algs->size[i].noalign;
6930 }
6931 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6932 last non-libcall inline algorithm. */
6933 if (TARGET_INLINE_ALL_STRINGOPS)
6934 {
6935 /* When the current size is best to be copied by a libcall,
6936 but we are still forced to inline, run the heuristic below
6937 that will pick code for medium sized blocks. */
6938 if (alg != libcall)
6939 {
6940 *noalign = alg_noalign;
6941 return alg;
6942 }
6943 else if (!any_alg_usable_p)
6944 break;
6945 }
6946 else if (alg_usable_p (candidate, memset, have_as))
6947 {
6948 *noalign = algs->size[i].noalign;
6949 return candidate;
6950 }
6951 }
6952 }
6953 }
6954 /* When asked to inline the call anyway, try to pick a meaningful choice.
6955 We look for the maximal block size that is faster to copy by hand and
6956 assume blocks of at most that size, guessing that the average size will
6957 be roughly half of that maximum.
6958
6959 If this turns out to be bad, we might simply specify the preferred
6960 choice in ix86_costs. */
6961 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6962 && (algs->unknown_size == libcall
6963 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6964 {
6965 enum stringop_alg alg;
6966 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6967
6968 /* If there aren't any usable algorithms or if recursing already,
6969 then recursing on smaller sizes or same size isn't going to
6970 find anything. Just return the simple byte-at-a-time copy loop. */
6971 if (!any_alg_usable_p || recur)
6972 {
6973 /* Pick something reasonable. */
6974 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6975 *dynamic_check = 128;
6976 return loop_1_byte;
6977 }
6978 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6979 zero_memset, have_as, dynamic_check, noalign, true);
6980 gcc_assert (*dynamic_check == -1);
6981 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6982 *dynamic_check = max;
6983 else
6984 gcc_assert (alg != libcall);
6985 return alg;
6986 }
6987 return (alg_usable_p (algs->unknown_size, memset, have_as)
6988 ? algs->unknown_size : libcall);
6989 }
6990
6991 /* Decide on alignment. We know that the operand is already aligned to ALIGN
6992 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
6993 static int
6994 decide_alignment (int align,
6995 enum stringop_alg alg,
6996 int expected_size,
6997 machine_mode move_mode)
6998 {
6999 int desired_align = 0;
7000
7001 gcc_assert (alg != no_stringop);
7002
7003 if (alg == libcall)
7004 return 0;
7005 if (move_mode == VOIDmode)
7006 return 0;
7007
7008 desired_align = GET_MODE_SIZE (move_mode);
7009 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
7010 copying a whole cache line at once.  */
7011 if (TARGET_PENTIUMPRO
7012 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7013 desired_align = 8;
7014
7015 if (optimize_size)
7016 desired_align = 1;
7017 if (desired_align < align)
7018 desired_align = align;
7019 if (expected_size != -1 && expected_size < 4)
7020 desired_align = align;
7021
7022 return desired_align;
7023 }
7024
7025
7026 /* Helper function for memset expansion.  For a QImode value 0xXY produce
7027 0xXYXYXYXY of the width specified by MODE.  This is essentially
7028 a * 0x01010101, but we can do slightly better than
7029 synth_mult by unwinding the sequence by hand on CPUs with
7030 slow multiply. */
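/* For illustration: promoting the constant byte 0xab yields 0xabababab in
   SImode and 0xabababababababab in DImode.  For a non-constant byte the same
   replication is done either with one multiply by the promoted constant 1
   (0x01010101 resp. 0x0101010101010101) or with a shift/or sequence,
   whichever the cost tables prefer.  */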
7031 static rtx
7032 promote_duplicated_reg (machine_mode mode, rtx val)
7033 {
7034 machine_mode valmode = GET_MODE (val);
7035 rtx tmp;
7036 int nops = mode == DImode ? 3 : 2;
7037
7038 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7039 if (val == const0_rtx)
7040 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7041 if (CONST_INT_P (val))
7042 {
7043 HOST_WIDE_INT v = INTVAL (val) & 255;
7044
7045 v |= v << 8;
7046 v |= v << 16;
7047 if (mode == DImode)
7048 v |= (v << 16) << 16;
7049 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7050 }
7051
7052 if (valmode == VOIDmode)
7053 valmode = QImode;
7054 if (valmode != QImode)
7055 val = gen_lowpart (QImode, val);
7056 if (mode == QImode)
7057 return val;
7058 if (!TARGET_PARTIAL_REG_STALL)
7059 nops--;
7060 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7061 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7062 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7063 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7064 {
7065 rtx reg = convert_modes (mode, QImode, val, true);
7066 tmp = promote_duplicated_reg (mode, const1_rtx);
7067 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7068 OPTAB_DIRECT);
7069 }
7070 else
7071 {
7072 rtx reg = convert_modes (mode, QImode, val, true);
7073
7074 if (!TARGET_PARTIAL_REG_STALL)
7075 emit_insn (gen_insv_1 (mode, reg, reg));
7076 else
7077 {
7078 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7079 NULL, 1, OPTAB_DIRECT);
7080 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7081 OPTAB_DIRECT);
7082 }
7083 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7084 NULL, 1, OPTAB_DIRECT);
7085 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7086 if (mode == SImode)
7087 return reg;
7088 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7089 NULL, 1, OPTAB_DIRECT);
7090 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7091 return reg;
7092 }
7093 }
7094
7095 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7096 will be needed by the main loop copying SIZE_NEEDED chunks and by the
7097 prologue raising the alignment from ALIGN to DESIRED_ALIGN.  */
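/* For instance (hypothetical arguments): SIZE_NEEDED == 16 with no extra
   alignment work on a 64-bit target yields a DImode replica of VAL, the widest
   integer piece the loops store, while SIZE_NEEDED == 4 would yield SImode.  */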
7098 static rtx
7099 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7100 int align)
7101 {
7102 rtx promoted_val;
7103
7104 if (TARGET_64BIT
7105 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7106 promoted_val = promote_duplicated_reg (DImode, val);
7107 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7108 promoted_val = promote_duplicated_reg (SImode, val);
7109 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7110 promoted_val = promote_duplicated_reg (HImode, val);
7111 else
7112 promoted_val = val;
7113
7114 return promoted_val;
7115 }
7116
7117 /* Copy the address to a Pmode register. This is used for x32 to
7118 truncate a DImode TLS address to a SImode register.  */
7119
7120 static rtx
7121 ix86_copy_addr_to_reg (rtx addr)
7122 {
7123 rtx reg;
7124 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7125 {
7126 reg = copy_addr_to_reg (addr);
7127 REG_POINTER (reg) = 1;
7128 return reg;
7129 }
7130 else
7131 {
7132 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7133 reg = copy_to_mode_reg (DImode, addr);
7134 REG_POINTER (reg) = 1;
7135 return gen_rtx_SUBREG (SImode, reg, 0);
7136 }
7137 }
7138
7139 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
7140 operations when profitable. The code depends upon architecture, block size
7141 and alignment, but always has one of the following overall structures:
7142
7143 Aligned move sequence:
7144
7145 1) Prologue guard: Conditional that jumps to the epilogue for small
7146 blocks that can be handled by the epilogue alone.  This is faster
7147 but also needed for correctness, since the prologue assumes the block
7148 is larger than the desired alignment.
7149
7150 Optional dynamic check for size and libcall for large
7151 blocks is emitted here too, with -minline-stringops-dynamically.
7152
7153 2) Prologue: copy first few bytes in order to get destination
7154 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7155 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7156 copied. We emit either a jump tree on power of two sized
7157 blocks, or a byte loop.
7158
7159 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7160 with specified algorithm.
7161
7162 4) Epilogue: code copying tail of the block that is too small to be
7163 handled by main body (or up to size guarded by prologue guard).
7164
7165 Misaligned move sequence
7166
7167 1) misaligned move prologue/epilogue containing:
7168 a) Prologue handling small memory blocks and jumping to done_label
7169 (skipped if blocks are known to be large enough)
7170 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
7171 needed by a single possibly misaligned move
7172 (skipped if alignment is not needed)
7173 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7174
7175 2) Zero size guard dispatching to done_label, if needed
7176
7177 3) dispatch to a library call, if needed,
7178
7179 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7180 with the specified algorithm.  */
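/* As a rough example (hypothetical parameters): a 64-bit memcpy of unknown
   size expanded with the unrolled_loop algorithm uses DImode moves with an
   unroll factor of 4, so SIZE_NEEDED is 32; the prologue guard then branches
   to the epilogue for counts below 32, and the epilogue jump tree handles the
   remaining count & 31 bytes.  */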
7181 bool
7182 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7183 rtx align_exp, rtx expected_align_exp,
7184 rtx expected_size_exp, rtx min_size_exp,
7185 rtx max_size_exp, rtx probable_max_size_exp,
7186 bool issetmem)
7187 {
7188 rtx destreg;
7189 rtx srcreg = NULL;
7190 rtx_code_label *label = NULL;
7191 rtx tmp;
7192 rtx_code_label *jump_around_label = NULL;
7193 HOST_WIDE_INT align = 1;
7194 unsigned HOST_WIDE_INT count = 0;
7195 HOST_WIDE_INT expected_size = -1;
7196 int size_needed = 0, epilogue_size_needed;
7197 int desired_align = 0, align_bytes = 0;
7198 enum stringop_alg alg;
7199 rtx promoted_val = NULL;
7200 rtx vec_promoted_val = NULL;
7201 bool force_loopy_epilogue = false;
7202 int dynamic_check;
7203 bool need_zero_guard = false;
7204 bool noalign;
7205 machine_mode move_mode = VOIDmode;
7206 machine_mode wider_mode;
7207 int unroll_factor = 1;
7208 /* TODO: Once value ranges are available, fill in proper data. */
7209 unsigned HOST_WIDE_INT min_size = 0;
7210 unsigned HOST_WIDE_INT max_size = -1;
7211 unsigned HOST_WIDE_INT probable_max_size = -1;
7212 bool misaligned_prologue_used = false;
7213 bool have_as;
7214
7215 if (CONST_INT_P (align_exp))
7216 align = INTVAL (align_exp);
7217 /* i386 can do misaligned access at a reasonably increased cost.  */
7218 if (CONST_INT_P (expected_align_exp)
7219 && INTVAL (expected_align_exp) > align)
7220 align = INTVAL (expected_align_exp);
7221 /* ALIGN is the minimum of destination and source alignment, but we care here
7222 just about destination alignment. */
7223 else if (!issetmem
7224 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7225 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7226
7227 if (CONST_INT_P (count_exp))
7228 {
7229 min_size = max_size = probable_max_size = count = expected_size
7230 = INTVAL (count_exp);
7231 /* When COUNT is 0, there is nothing to do. */
7232 if (!count)
7233 return true;
7234 }
7235 else
7236 {
7237 if (min_size_exp)
7238 min_size = INTVAL (min_size_exp);
7239 if (max_size_exp)
7240 max_size = INTVAL (max_size_exp);
7241 if (probable_max_size_exp)
7242 probable_max_size = INTVAL (probable_max_size_exp);
7243 if (CONST_INT_P (expected_size_exp))
7244 expected_size = INTVAL (expected_size_exp);
7245 }
7246
7247 /* Make sure we don't need to care about overflow later on. */
7248 if (count > (HOST_WIDE_INT_1U << 30))
7249 return false;
7250
7251 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7252 if (!issetmem)
7253 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7254
7255 /* Step 0: Decide on preferred algorithm, desired alignment and
7256 size of chunks to be copied by main loop. */
7257 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7258 issetmem,
7259 issetmem && val_exp == const0_rtx, have_as,
7260 &dynamic_check, &noalign, false);
7261
7262 if (dump_file)
7263 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7264 stringop_alg_names[alg]);
7265
7266 if (alg == libcall)
7267 return false;
7268 gcc_assert (alg != no_stringop);
7269
7270 /* For now the vector version of memset is generated only for memory zeroing,
7271 as creating the promoted vector value is very cheap in this case.  */
7272 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7273 alg = unrolled_loop;
7274
7275 if (!count)
7276 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7277 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7278 if (!issetmem)
7279 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7280
7281 unroll_factor = 1;
7282 move_mode = word_mode;
7283 switch (alg)
7284 {
7285 case libcall:
7286 case no_stringop:
7287 case last_alg:
7288 gcc_unreachable ();
7289 case loop_1_byte:
7290 need_zero_guard = true;
7291 move_mode = QImode;
7292 break;
7293 case loop:
7294 need_zero_guard = true;
7295 break;
7296 case unrolled_loop:
7297 need_zero_guard = true;
7298 unroll_factor = (TARGET_64BIT ? 4 : 2);
7299 break;
7300 case vector_loop:
7301 need_zero_guard = true;
7302 unroll_factor = 4;
7303 /* Find the widest supported mode. */
7304 move_mode = word_mode;
7305 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7306 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7307 move_mode = wider_mode;
7308
7309 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7310 move_mode = TImode;
7311
7312 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7313 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7314 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7315 {
7316 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7317 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7318 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7319 move_mode = word_mode;
7320 }
7321 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7322 break;
7323 case rep_prefix_8_byte:
7324 move_mode = DImode;
7325 break;
7326 case rep_prefix_4_byte:
7327 move_mode = SImode;
7328 break;
7329 case rep_prefix_1_byte:
7330 move_mode = QImode;
7331 break;
7332 }
7333 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7334 epilogue_size_needed = size_needed;
7335
7336 /* If we are going to emit any library calls conditionally, make sure any
7337 pending stack adjustments happen before the first conditional branch,
7338 otherwise they will be emitted only before the library call and won't
7339 happen on the other branches.  */
7340 if (dynamic_check != -1)
7341 do_pending_stack_adjust ();
7342
7343 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7344 if (!TARGET_ALIGN_STRINGOPS || noalign)
7345 align = desired_align;
7346
7347 /* Step 1: Prologue guard. */
7348
7349 /* Alignment code needs count to be in register. */
7350 if (CONST_INT_P (count_exp) && desired_align > align)
7351 {
7352 if (INTVAL (count_exp) > desired_align
7353 && INTVAL (count_exp) > size_needed)
7354 {
7355 align_bytes
7356 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7357 if (align_bytes <= 0)
7358 align_bytes = 0;
7359 else
7360 align_bytes = desired_align - align_bytes;
7361 }
7362 if (align_bytes == 0)
7363 count_exp = force_reg (counter_mode (count_exp), count_exp);
7364 }
7365 gcc_assert (desired_align >= 1 && align >= 1);
7366
7367 /* Misaligned move sequences handle both prologue and epilogue at once.
7368 Default code generation results in smaller code for large alignments
7369 and also avoids redundant work when sizes are known precisely.  */
7370 misaligned_prologue_used
7371 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7372 && MAX (desired_align, epilogue_size_needed) <= 32
7373 && desired_align <= epilogue_size_needed
7374 && ((desired_align > align && !align_bytes)
7375 || (!count && epilogue_size_needed > 1)));
7376
7377 /* Do the cheap promotion to allow better CSE across the
7378 main loop and epilogue (i.e. one load of the big constant in
7379 front of all the code).
7380 For now the misaligned move sequences do not have a fast path
7381 without broadcasting. */
7382 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7383 {
7384 if (alg == vector_loop)
7385 {
7386 gcc_assert (val_exp == const0_rtx);
7387 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7388 promoted_val = promote_duplicated_reg_to_size (val_exp,
7389 GET_MODE_SIZE (word_mode),
7390 desired_align, align);
7391 }
7392 else
7393 {
7394 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7395 desired_align, align);
7396 }
7397 }
7398 /* Misaligned move sequences handle both prologues and epilogues at once.
7399 Default code generation results in smaller code for large alignments and
7400 also avoids redundant work when sizes are known precisely.  */
7401 if (misaligned_prologue_used)
7402 {
7403 /* The misaligned move prologue handles small blocks by itself.  */
7404 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7405 (dst, src, &destreg, &srcreg,
7406 move_mode, promoted_val, vec_promoted_val,
7407 &count_exp,
7408 &jump_around_label,
7409 desired_align < align
7410 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7411 desired_align, align, &min_size, dynamic_check, issetmem);
7412 if (!issetmem)
7413 src = change_address (src, BLKmode, srcreg);
7414 dst = change_address (dst, BLKmode, destreg);
7415 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7416 epilogue_size_needed = 0;
7417 if (need_zero_guard
7418 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7419 {
7420 /* It is possible that we copied enough so the main loop will not
7421 execute. */
7422 gcc_assert (size_needed > 1);
7423 if (jump_around_label == NULL_RTX)
7424 jump_around_label = gen_label_rtx ();
7425 emit_cmp_and_jump_insns (count_exp,
7426 GEN_INT (size_needed),
7427 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7428 if (expected_size == -1
7429 || expected_size < (desired_align - align) / 2 + size_needed)
7430 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7431 else
7432 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7433 }
7434 }
7435 /* Ensure that the alignment prologue won't copy past the end of the block.  */
7436 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7437 {
7438 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7439 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
7440 Make sure EPILOGUE_SIZE_NEEDED is a power of 2.  */
7441 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7442
7443 /* To improve performance of small blocks, we jump around the VAL
7444 promoting code.  This means that if the promoted VAL is not constant,
7445 we might not use it in the epilogue and have to use the byte
7446 loop variant.  */
7447 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7448 force_loopy_epilogue = true;
7449 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7450 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7451 {
7452 /* If main algorithm works on QImode, no epilogue is needed.
7453 For small sizes just don't align anything. */
7454 if (size_needed == 1)
7455 desired_align = align;
7456 else
7457 goto epilogue;
7458 }
7459 else if (!count
7460 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7461 {
7462 label = gen_label_rtx ();
7463 emit_cmp_and_jump_insns (count_exp,
7464 GEN_INT (epilogue_size_needed),
7465 LTU, 0, counter_mode (count_exp), 1, label);
7466 if (expected_size == -1 || expected_size < epilogue_size_needed)
7467 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7468 else
7469 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7470 }
7471 }
7472
7473 /* Emit code to decide on runtime whether library call or inline should be
7474 used. */
7475 if (dynamic_check != -1)
7476 {
7477 if (!issetmem && CONST_INT_P (count_exp))
7478 {
7479 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7480 {
7481 emit_block_copy_via_libcall (dst, src, count_exp);
7482 count_exp = const0_rtx;
7483 goto epilogue;
7484 }
7485 }
7486 else
7487 {
7488 rtx_code_label *hot_label = gen_label_rtx ();
7489 if (jump_around_label == NULL_RTX)
7490 jump_around_label = gen_label_rtx ();
7491 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7492 LEU, 0, counter_mode (count_exp),
7493 1, hot_label);
7494 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7495 if (issetmem)
7496 set_storage_via_libcall (dst, count_exp, val_exp);
7497 else
7498 emit_block_copy_via_libcall (dst, src, count_exp);
7499 emit_jump (jump_around_label);
7500 emit_label (hot_label);
7501 }
7502 }
7503
7504 /* Step 2: Alignment prologue. */
7505 /* Do the expensive promotion once we branched off the small blocks. */
7506 if (issetmem && !promoted_val)
7507 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7508 desired_align, align);
7509
7510 if (desired_align > align && !misaligned_prologue_used)
7511 {
7512 if (align_bytes == 0)
7513 {
7514 /* Except for the first move in the prologue, we no longer know
7515 the constant offset in the aliasing info.  It doesn't seem worth
7516 the pain to maintain it for the first move, so throw away
7517 the info early. */
7518 dst = change_address (dst, BLKmode, destreg);
7519 if (!issetmem)
7520 src = change_address (src, BLKmode, srcreg);
7521 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7522 promoted_val, vec_promoted_val,
7523 count_exp, align, desired_align,
7524 issetmem);
7525 /* At most desired_align - align bytes are copied. */
7526 if (min_size < (unsigned)(desired_align - align))
7527 min_size = 0;
7528 else
7529 min_size -= desired_align - align;
7530 }
7531 else
7532 {
7533 /* If we know how many bytes need to be stored before dst is
7534 sufficiently aligned, maintain aliasing info accurately. */
7535 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7536 srcreg,
7537 promoted_val,
7538 vec_promoted_val,
7539 desired_align,
7540 align_bytes,
7541 issetmem);
7542
7543 count_exp = plus_constant (counter_mode (count_exp),
7544 count_exp, -align_bytes);
7545 count -= align_bytes;
7546 min_size -= align_bytes;
7547 max_size -= align_bytes;
7548 }
7549 if (need_zero_guard
7550 && min_size < (unsigned HOST_WIDE_INT) size_needed
7551 && (count < (unsigned HOST_WIDE_INT) size_needed
7552 || (align_bytes == 0
7553 && count < ((unsigned HOST_WIDE_INT) size_needed
7554 + desired_align - align))))
7555 {
7556 /* It is possible that we copied enough so the main loop will not
7557 execute. */
7558 gcc_assert (size_needed > 1);
7559 if (label == NULL_RTX)
7560 label = gen_label_rtx ();
7561 emit_cmp_and_jump_insns (count_exp,
7562 GEN_INT (size_needed),
7563 LTU, 0, counter_mode (count_exp), 1, label);
7564 if (expected_size == -1
7565 || expected_size < (desired_align - align) / 2 + size_needed)
7566 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7567 else
7568 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7569 }
7570 }
7571 if (label && size_needed == 1)
7572 {
7573 emit_label (label);
7574 LABEL_NUSES (label) = 1;
7575 label = NULL;
7576 epilogue_size_needed = 1;
7577 if (issetmem)
7578 promoted_val = val_exp;
7579 }
7580 else if (label == NULL_RTX && !misaligned_prologue_used)
7581 epilogue_size_needed = size_needed;
7582
7583 /* Step 3: Main loop. */
7584
7585 switch (alg)
7586 {
7587 case libcall:
7588 case no_stringop:
7589 case last_alg:
7590 gcc_unreachable ();
7591 case loop_1_byte:
7592 case loop:
7593 case unrolled_loop:
7594 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7595 count_exp, move_mode, unroll_factor,
7596 expected_size, issetmem);
7597 break;
7598 case vector_loop:
7599 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7600 vec_promoted_val, count_exp, move_mode,
7601 unroll_factor, expected_size, issetmem);
7602 break;
7603 case rep_prefix_8_byte:
7604 case rep_prefix_4_byte:
7605 case rep_prefix_1_byte:
7606 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7607 val_exp, count_exp, move_mode, issetmem);
7608 break;
7609 }
7610 /* Properly adjust the offsets of the src and dest memory for aliasing.  */
7611 if (CONST_INT_P (count_exp))
7612 {
7613 if (!issetmem)
7614 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7615 (count / size_needed) * size_needed);
7616 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7617 (count / size_needed) * size_needed);
7618 }
7619 else
7620 {
7621 if (!issetmem)
7622 src = change_address (src, BLKmode, srcreg);
7623 dst = change_address (dst, BLKmode, destreg);
7624 }
7625
7626 /* Step 4: Epilogue to copy the remaining bytes. */
7627 epilogue:
7628 if (label)
7629 {
7630 /* When the main loop is done, COUNT_EXP might hold the original count,
7631 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
7632 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
7633 bytes.  Compensate if needed.  */
7634
7635 if (size_needed < epilogue_size_needed)
7636 {
7637 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7638 GEN_INT (size_needed - 1), count_exp, 1,
7639 OPTAB_DIRECT);
7640 if (tmp != count_exp)
7641 emit_move_insn (count_exp, tmp);
7642 }
7643 emit_label (label);
7644 LABEL_NUSES (label) = 1;
7645 }
7646
7647 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7648 {
7649 if (force_loopy_epilogue)
7650 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7651 epilogue_size_needed);
7652 else
7653 {
7654 if (issetmem)
7655 expand_setmem_epilogue (dst, destreg, promoted_val,
7656 vec_promoted_val, count_exp,
7657 epilogue_size_needed);
7658 else
7659 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7660 epilogue_size_needed);
7661 }
7662 }
7663 if (jump_around_label)
7664 emit_label (jump_around_label);
7665 return true;
7666 }
7667
7668
7669 /* Expand the appropriate insns for doing strlen if not just doing
7670 repnz; scasb
7671
7672 out = result, initialized with the start address
7673 align_rtx = alignment of the address.
7674 scratch = scratch register, initialized with the start address when
7675 not aligned, otherwise undefined
7676
7677 This is just the body. It needs the initializations mentioned above and
7678 some address computing at the end. These things are done in i386.md. */
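/* A rough outline of the body emitted below: the prologue compares up to
   three leading bytes one at a time until OUT is 4-byte aligned, the main
   loop then loads four bytes per iteration and applies the zero-byte test
   (x - 0x01010101) & ~x & 0x80808080, and a short tail (using cmov when
   available) adjusts OUT so that it points at the terminating zero byte.  */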
7679
7680 static void
7681 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7682 {
7683 int align;
7684 rtx tmp;
7685 rtx_code_label *align_2_label = NULL;
7686 rtx_code_label *align_3_label = NULL;
7687 rtx_code_label *align_4_label = gen_label_rtx ();
7688 rtx_code_label *end_0_label = gen_label_rtx ();
7689 rtx mem;
7690 rtx tmpreg = gen_reg_rtx (SImode);
7691 rtx scratch = gen_reg_rtx (SImode);
7692 rtx cmp;
7693
7694 align = 0;
7695 if (CONST_INT_P (align_rtx))
7696 align = INTVAL (align_rtx);
7697
7698 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7699
7700 /* Is there a known alignment and is it less than 4? */
7701 if (align < 4)
7702 {
7703 rtx scratch1 = gen_reg_rtx (Pmode);
7704 emit_move_insn (scratch1, out);
7705 /* Is there a known alignment and is it not 2? */
7706 if (align != 2)
7707 {
7708 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7709 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7710
7711 /* Leave just the 3 lower bits. */
7712 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7713 NULL_RTX, 0, OPTAB_WIDEN);
7714
7715 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7716 Pmode, 1, align_4_label);
7717 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7718 Pmode, 1, align_2_label);
7719 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7720 Pmode, 1, align_3_label);
7721 }
7722 else
7723 {
7724 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7725 check whether it is aligned to 4 bytes. */
7726
7727 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7728 NULL_RTX, 0, OPTAB_WIDEN);
7729
7730 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7731 Pmode, 1, align_4_label);
7732 }
7733
7734 mem = change_address (src, QImode, out);
7735
7736 /* Now compare the bytes. */
7737
7738 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
7739 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7740 QImode, 1, end_0_label);
7741
7742 /* Increment the address. */
7743 emit_insn (gen_add2_insn (out, const1_rtx));
7744
7745 /* Not needed with an alignment of 2. */
7746 if (align != 2)
7747 {
7748 emit_label (align_2_label);
7749
7750 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7751 end_0_label);
7752
7753 emit_insn (gen_add2_insn (out, const1_rtx));
7754
7755 emit_label (align_3_label);
7756 }
7757
7758 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7759 end_0_label);
7760
7761 emit_insn (gen_add2_insn (out, const1_rtx));
7762 }
7763
7764 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
7765 align this loop: doing so only bloats the code and does not make it
7766 any faster. */
7767 emit_label (align_4_label);
7768
7769 mem = change_address (src, SImode, out);
7770 emit_move_insn (scratch, mem);
7771 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7772
7773 /* This formula yields a nonzero result iff one of the bytes is zero.
7774 This saves three branches inside the loop and many cycles. */
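/* Concretely, the insns emitted below compute
     tmpreg = (x - 0x01010101) & ~x & 0x80808080.
   For example, with x = 0x12003456:
     x - 0x01010101 = 0x10ff3355
     ~x             = 0xedffcba9
     and of the two = 0x00ff0301
     & 0x80808080   = 0x00800000   (nonzero, flagging the zero byte).
   A zero byte wraps to 0xff under the subtraction while its complement is
   also 0xff, so its 0x80 bit survives the masking; a nonzero byte cannot
   keep that bit unless a borrow from a lower zero byte reaches it.  */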
7775
7776 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7777 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7778 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7779 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7780 gen_int_mode (0x80808080, SImode)));
7781 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7782 align_4_label);
7783
7784 if (TARGET_CMOVE)
7785 {
7786 rtx reg = gen_reg_rtx (SImode);
7787 rtx reg2 = gen_reg_rtx (Pmode);
7788 emit_move_insn (reg, tmpreg);
7789 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7790
7791 /* If zero is not in the first two bytes, move two bytes forward. */
7792 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7793 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7794 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7795 emit_insn (gen_rtx_SET (tmpreg,
7796 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7797 reg,
7798 tmpreg)));
7799 /* Emit the lea manually to avoid clobbering the flags. */
7800 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7801
7802 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7803 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7804 emit_insn (gen_rtx_SET (out,
7805 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7806 reg2,
7807 out)));
7808 }
7809 else
7810 {
7811 rtx_code_label *end_2_label = gen_label_rtx ();
7812 /* Is zero in the first two bytes? */
7813
7814 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7815 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7816 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7817 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7818 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7819 pc_rtx);
7820 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7821 JUMP_LABEL (tmp) = end_2_label;
7822
7823 /* Not in the first two. Move two bytes forward. */
7824 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7825 emit_insn (gen_add2_insn (out, const2_rtx));
7826
7827 emit_label (end_2_label);
7828
7829 }
7830
7831 /* Avoid branch in fixing the byte. */
7832 tmpreg = gen_lowpart (QImode, tmpreg);
7833 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7834 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7835 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7836 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7837
7838 emit_label (end_0_label);
7839 }
7840
7841 /* Expand strlen. */
7842
7843 bool
7844 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7845 {
7846 if (TARGET_UNROLL_STRLEN
7847 && TARGET_INLINE_ALL_STRINGOPS
7848 && eoschar == const0_rtx
7849 && optimize > 1)
7850 {
7851 /* The generic case of the strlen expander is long. Avoid
7852 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
7853 rtx addr = force_reg (Pmode, XEXP (src, 0));
7854 /* Well, it seems that some optimizer does not combine a call like
7855 foo(strlen(bar), strlen(bar));
7856 when the move and the subtraction are done here. It does calculate
7857 the length just once when these instructions are emitted inside
7858 output_strlen_unroll(). But since &bar[strlen(bar)] is often used
7859 and this uses one fewer register for the lifetime of
7860 output_strlen_unroll(), this is better. */
7861
7862 emit_move_insn (out, addr);
7863
7864 ix86_expand_strlensi_unroll_1 (out, src, align);
7865
7866 /* strlensi_unroll_1 returns the address of the zero at the end of
7867 the string, like memchr(), so compute the length by subtracting
7868 the start address. */
7869 emit_insn (gen_sub2_insn (out, addr));
7870 return true;
7871 }
7872 else
7873 return false;
7874 }
7875
7876 /* For a given symbol (function), construct code to compute the address of
7877 its PLT entry in the large x86-64 PIC model. */
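/* Roughly, for a symbol FOO the emitted RTL corresponds to
     movabs $FOO@PLTOFF, %tmp
     add    %<got-base>, %tmp
   where %<got-base> is the register holding the GOT base in the large PIC
   model; the mnemonics are only illustrative of the moves emitted here.  */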
7878
7879 static rtx
7880 construct_plt_address (rtx symbol)
7881 {
7882 rtx tmp, unspec;
7883
7884 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7885 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7886 gcc_assert (Pmode == DImode);
7887
7888 tmp = gen_reg_rtx (Pmode);
7889 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7890
7891 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7892 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7893 return tmp;
7894 }
7895
7896 /* Additional registers that are clobbered by SYSV calls: call-clobbered in the SYSV ABI but call-saved in the MS ABI. */
7897
7898 static int const x86_64_ms_sysv_extra_clobbered_registers
7899 [NUM_X86_64_MS_CLOBBERED_REGS] =
7900 {
7901 SI_REG, DI_REG,
7902 XMM6_REG, XMM7_REG,
7903 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7904 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7905 };
7906
7907 rtx_insn *
7908 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7909 rtx callarg2,
7910 rtx pop, bool sibcall)
7911 {
7912 rtx vec[3];
7913 rtx use = NULL, call;
7914 unsigned int vec_len = 0;
7915 tree fndecl;
7916
7917 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7918 {
7919 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7920 if (fndecl
7921 && (lookup_attribute ("interrupt",
7922 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7923 error ("interrupt service routine cannot be called directly");
7924 }
7925 else
7926 fndecl = NULL_TREE;
7927
7928 if (pop == const0_rtx)
7929 pop = NULL;
7930 gcc_assert (!TARGET_64BIT || !pop);
7931
7932 if (TARGET_MACHO && !TARGET_64BIT)
7933 {
7934 #if TARGET_MACHO
7935 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7936 fnaddr = machopic_indirect_call_target (fnaddr);
7937 #endif
7938 }
7939 else
7940 {
7941 /* Static functions and indirect calls don't need the pic register. Also,
7942 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
7943 attribute, which makes it an indirect call. */
7944 rtx addr = XEXP (fnaddr, 0);
7945 if (flag_pic
7946 && GET_CODE (addr) == SYMBOL_REF
7947 && !SYMBOL_REF_LOCAL_P (addr))
7948 {
7949 if (flag_plt
7950 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7951 || !lookup_attribute ("noplt",
7952 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7953 {
7954 if (!TARGET_64BIT
7955 || (ix86_cmodel == CM_LARGE_PIC
7956 && DEFAULT_ABI != MS_ABI))
7957 {
7958 use_reg (&use, gen_rtx_REG (Pmode,
7959 REAL_PIC_OFFSET_TABLE_REGNUM));
7960 if (ix86_use_pseudo_pic_reg ())
7961 emit_move_insn (gen_rtx_REG (Pmode,
7962 REAL_PIC_OFFSET_TABLE_REGNUM),
7963 pic_offset_table_rtx);
7964 }
7965 }
7966 else if (!TARGET_PECOFF && !TARGET_MACHO)
7967 {
7968 if (TARGET_64BIT)
7969 {
7970 fnaddr = gen_rtx_UNSPEC (Pmode,
7971 gen_rtvec (1, addr),
7972 UNSPEC_GOTPCREL);
7973 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7974 }
7975 else
7976 {
7977 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
7978 UNSPEC_GOT);
7979 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7980 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
7981 fnaddr);
7982 }
7983 fnaddr = gen_const_mem (Pmode, fnaddr);
7984 /* Pmode may not be the same as word_mode for x32, which
7985 doesn't support an indirect branch via a 32-bit memory slot.
7986 Since the x32 GOT slot is 64 bits with the upper 32 bits zero,
7987 an indirect branch via the x32 GOT slot is OK. */
7988 if (GET_MODE (fnaddr) != word_mode)
7989 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
7990 fnaddr = gen_rtx_MEM (QImode, fnaddr);
7991 }
7992 }
7993 }
7994
7995 /* Skip setting up RAX register for -mskip-rax-setup when there are no
7996 parameters passed in vector registers. */
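/* In the 64-bit SysV ABI, AL carries an upper bound on the number of
   vector registers used to pass arguments to a variadic function; the
   QImode move into AX below is what supplies that value.  */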
7997 if (TARGET_64BIT
7998 && (INTVAL (callarg2) > 0
7999 || (INTVAL (callarg2) == 0
8000 && (TARGET_SSE || !flag_skip_rax_setup))))
8001 {
8002 rtx al = gen_rtx_REG (QImode, AX_REG);
8003 emit_move_insn (al, callarg2);
8004 use_reg (&use, al);
8005 }
8006
8007 if (ix86_cmodel == CM_LARGE_PIC
8008 && !TARGET_PECOFF
8009 && MEM_P (fnaddr)
8010 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8011 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8012 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8013 /* Since the x32 GOT slot is 64 bits with the upper 32 bits zero, an
8014 indirect branch via the x32 GOT slot is OK. */
8015 else if (!(TARGET_X32
8016 && MEM_P (fnaddr)
8017 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8018 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8019 && (sibcall
8020 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8021 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8022 {
8023 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8024 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8025 }
8026
8027 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8028
8029 if (retval)
8030 call = gen_rtx_SET (retval, call);
8031 vec[vec_len++] = call;
8032
8033 if (pop)
8034 {
8035 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8036 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8037 vec[vec_len++] = pop;
8038 }
8039
8040 if (cfun->machine->no_caller_saved_registers
8041 && (!fndecl
8042 || (!TREE_THIS_VOLATILE (fndecl)
8043 && !lookup_attribute ("no_caller_saved_registers",
8044 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8045 {
8046 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8047 bool is_64bit_ms_abi = (TARGET_64BIT
8048 && ix86_function_abi (fndecl) == MS_ABI);
8049 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8050
8051 /* The current function has no caller-saved registers, so explicitly
8052 clobber every register that the (returning) call may clobber. */
8053 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8054 if (!fixed_regs[i]
8055 && (ix86_call_used_regs[i] == 1
8056 || (ix86_call_used_regs[i] & c_mask))
8057 && !STACK_REGNO_P (i)
8058 && !MMX_REGNO_P (i))
8059 clobber_reg (&use,
8060 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8061 }
8062 else if (TARGET_64BIT_MS_ABI
8063 && (!callarg2 || INTVAL (callarg2) != -2))
8064 {
8065 unsigned i;
8066
8067 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8068 {
8069 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8070 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8071
8072 clobber_reg (&use, gen_rtx_REG (mode, regno));
8073 }
8074
8075 /* Set here, but it may get cleared later. */
8076 if (TARGET_CALL_MS2SYSV_XLOGUES)
8077 {
8078 if (!TARGET_SSE)
8079 ;
8080
8081 /* Don't break hot-patched functions. */
8082 else if (ix86_function_ms_hook_prologue (current_function_decl))
8083 ;
8084
8085 /* TODO: Cases not yet examined. */
8086 else if (flag_split_stack)
8087 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8088
8089 else
8090 {
8091 gcc_assert (!reload_completed);
8092 cfun->machine->call_ms2sysv = true;
8093 }
8094 }
8095 }
8096
8097 if (vec_len > 1)
8098 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8099 rtx_insn *call_insn = emit_call_insn (call);
8100 if (use)
8101 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8102
8103 return call_insn;
8104 }
8105
8106 /* Split a simple return that pops POPC bytes from the stack into an
8107 indirect branch plus a stack adjustment. */
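/* The emitted sequence is essentially
     pop  %ecx          (retrieve the return address)
     add  $POPC, %esp   (drop the callee-popped argument bytes)
     jmp  *%ecx
   with REG_CFA notes attached so the unwind information stays correct.  */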
8108
8109 void
8110 ix86_split_simple_return_pop_internal (rtx popc)
8111 {
8112 struct machine_function *m = cfun->machine;
8113 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8114 rtx_insn *insn;
8115
8116 /* There is no "pascal" calling convention in any 64bit ABI. */
8117 gcc_assert (!TARGET_64BIT);
8118
8119 insn = emit_insn (gen_pop (ecx));
8120 m->fs.cfa_offset -= UNITS_PER_WORD;
8121 m->fs.sp_offset -= UNITS_PER_WORD;
8122
8123 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8124 x = gen_rtx_SET (stack_pointer_rtx, x);
8125 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8126 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8127 RTX_FRAME_RELATED_P (insn) = 1;
8128
8129 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8130 x = gen_rtx_SET (stack_pointer_rtx, x);
8131 insn = emit_insn (x);
8132 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8133 RTX_FRAME_RELATED_P (insn) = 1;
8134
8135 /* Now the return address is in ECX. */
8136 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8137 }
8138
8139 /* Errors in the source file can cause expand_expr to return const0_rtx
8140 where we expect a vector. To avoid crashing, use one of the vector
8141 clear instructions. */
8142
8143 static rtx
8144 safe_vector_operand (rtx x, machine_mode mode)
8145 {
8146 if (x == const0_rtx)
8147 x = CONST0_RTX (mode);
8148 return x;
8149 }
8150
8151 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8152
8153 static rtx
8154 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8155 {
8156 rtx pat;
8157 tree arg0 = CALL_EXPR_ARG (exp, 0);
8158 tree arg1 = CALL_EXPR_ARG (exp, 1);
8159 rtx op0 = expand_normal (arg0);
8160 rtx op1 = expand_normal (arg1);
8161 machine_mode tmode = insn_data[icode].operand[0].mode;
8162 machine_mode mode0 = insn_data[icode].operand[1].mode;
8163 machine_mode mode1 = insn_data[icode].operand[2].mode;
8164
8165 if (VECTOR_MODE_P (mode0))
8166 op0 = safe_vector_operand (op0, mode0);
8167 if (VECTOR_MODE_P (mode1))
8168 op1 = safe_vector_operand (op1, mode1);
8169
8170 if (optimize || !target
8171 || GET_MODE (target) != tmode
8172 || !insn_data[icode].operand[0].predicate (target, tmode))
8173 target = gen_reg_rtx (tmode);
8174
8175 if (GET_MODE (op1) == SImode && mode1 == TImode)
8176 {
8177 rtx x = gen_reg_rtx (V4SImode);
8178 emit_insn (gen_sse2_loadd (x, op1));
8179 op1 = gen_lowpart (TImode, x);
8180 }
8181
8182 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8183 op0 = copy_to_mode_reg (mode0, op0);
8184 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8185 op1 = copy_to_mode_reg (mode1, op1);
8186
8187 pat = GEN_FCN (icode) (target, op0, op1);
8188 if (! pat)
8189 return 0;
8190
8191 emit_insn (pat);
8192
8193 return target;
8194 }
8195
8196 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
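/* Roughly, each MULTI_ARG_* case below records how many arguments the insn
   takes, whether the last one must be an immediate (LAST_ARG_CONSTANT),
   whether a comparison rtx built from SUB_CODE is passed as an extra
   operand (COMPARISON_P), and whether SUB_CODE is instead appended as a
   trailing integer operand (TF_P).  */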
8197
8198 static rtx
8199 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8200 enum ix86_builtin_func_type m_type,
8201 enum rtx_code sub_code)
8202 {
8203 rtx pat;
8204 int i;
8205 int nargs;
8206 bool comparison_p = false;
8207 bool tf_p = false;
8208 bool last_arg_constant = false;
8209 int num_memory = 0;
8210 struct {
8211 rtx op;
8212 machine_mode mode;
8213 } args[4];
8214
8215 machine_mode tmode = insn_data[icode].operand[0].mode;
8216
8217 switch (m_type)
8218 {
8219 case MULTI_ARG_4_DF2_DI_I:
8220 case MULTI_ARG_4_DF2_DI_I1:
8221 case MULTI_ARG_4_SF2_SI_I:
8222 case MULTI_ARG_4_SF2_SI_I1:
8223 nargs = 4;
8224 last_arg_constant = true;
8225 break;
8226
8227 case MULTI_ARG_3_SF:
8228 case MULTI_ARG_3_DF:
8229 case MULTI_ARG_3_SF2:
8230 case MULTI_ARG_3_DF2:
8231 case MULTI_ARG_3_DI:
8232 case MULTI_ARG_3_SI:
8233 case MULTI_ARG_3_SI_DI:
8234 case MULTI_ARG_3_HI:
8235 case MULTI_ARG_3_HI_SI:
8236 case MULTI_ARG_3_QI:
8237 case MULTI_ARG_3_DI2:
8238 case MULTI_ARG_3_SI2:
8239 case MULTI_ARG_3_HI2:
8240 case MULTI_ARG_3_QI2:
8241 nargs = 3;
8242 break;
8243
8244 case MULTI_ARG_2_SF:
8245 case MULTI_ARG_2_DF:
8246 case MULTI_ARG_2_DI:
8247 case MULTI_ARG_2_SI:
8248 case MULTI_ARG_2_HI:
8249 case MULTI_ARG_2_QI:
8250 nargs = 2;
8251 break;
8252
8253 case MULTI_ARG_2_DI_IMM:
8254 case MULTI_ARG_2_SI_IMM:
8255 case MULTI_ARG_2_HI_IMM:
8256 case MULTI_ARG_2_QI_IMM:
8257 nargs = 2;
8258 last_arg_constant = true;
8259 break;
8260
8261 case MULTI_ARG_1_SF:
8262 case MULTI_ARG_1_DF:
8263 case MULTI_ARG_1_SF2:
8264 case MULTI_ARG_1_DF2:
8265 case MULTI_ARG_1_DI:
8266 case MULTI_ARG_1_SI:
8267 case MULTI_ARG_1_HI:
8268 case MULTI_ARG_1_QI:
8269 case MULTI_ARG_1_SI_DI:
8270 case MULTI_ARG_1_HI_DI:
8271 case MULTI_ARG_1_HI_SI:
8272 case MULTI_ARG_1_QI_DI:
8273 case MULTI_ARG_1_QI_SI:
8274 case MULTI_ARG_1_QI_HI:
8275 nargs = 1;
8276 break;
8277
8278 case MULTI_ARG_2_DI_CMP:
8279 case MULTI_ARG_2_SI_CMP:
8280 case MULTI_ARG_2_HI_CMP:
8281 case MULTI_ARG_2_QI_CMP:
8282 nargs = 2;
8283 comparison_p = true;
8284 break;
8285
8286 case MULTI_ARG_2_SF_TF:
8287 case MULTI_ARG_2_DF_TF:
8288 case MULTI_ARG_2_DI_TF:
8289 case MULTI_ARG_2_SI_TF:
8290 case MULTI_ARG_2_HI_TF:
8291 case MULTI_ARG_2_QI_TF:
8292 nargs = 2;
8293 tf_p = true;
8294 break;
8295
8296 default:
8297 gcc_unreachable ();
8298 }
8299
8300 if (optimize || !target
8301 || GET_MODE (target) != tmode
8302 || !insn_data[icode].operand[0].predicate (target, tmode))
8303 target = gen_reg_rtx (tmode);
8304 else if (memory_operand (target, tmode))
8305 num_memory++;
8306
8307 gcc_assert (nargs <= 4);
8308
8309 for (i = 0; i < nargs; i++)
8310 {
8311 tree arg = CALL_EXPR_ARG (exp, i);
8312 rtx op = expand_normal (arg);
8313 int adjust = (comparison_p) ? 1 : 0;
8314 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8315
8316 if (last_arg_constant && i == nargs - 1)
8317 {
8318 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8319 {
8320 enum insn_code new_icode = icode;
8321 switch (icode)
8322 {
8323 case CODE_FOR_xop_vpermil2v2df3:
8324 case CODE_FOR_xop_vpermil2v4sf3:
8325 case CODE_FOR_xop_vpermil2v4df3:
8326 case CODE_FOR_xop_vpermil2v8sf3:
8327 error ("the last argument must be a 2-bit immediate");
8328 return gen_reg_rtx (tmode);
8329 case CODE_FOR_xop_rotlv2di3:
8330 new_icode = CODE_FOR_rotlv2di3;
8331 goto xop_rotl;
8332 case CODE_FOR_xop_rotlv4si3:
8333 new_icode = CODE_FOR_rotlv4si3;
8334 goto xop_rotl;
8335 case CODE_FOR_xop_rotlv8hi3:
8336 new_icode = CODE_FOR_rotlv8hi3;
8337 goto xop_rotl;
8338 case CODE_FOR_xop_rotlv16qi3:
8339 new_icode = CODE_FOR_rotlv16qi3;
8340 xop_rotl:
8341 if (CONST_INT_P (op))
8342 {
8343 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8344 op = GEN_INT (INTVAL (op) & mask);
8345 gcc_checking_assert
8346 (insn_data[icode].operand[i + 1].predicate (op, mode));
8347 }
8348 else
8349 {
8350 gcc_checking_assert
8351 (nargs == 2
8352 && insn_data[new_icode].operand[0].mode == tmode
8353 && insn_data[new_icode].operand[1].mode == tmode
8354 && insn_data[new_icode].operand[2].mode == mode
8355 && insn_data[new_icode].operand[0].predicate
8356 == insn_data[icode].operand[0].predicate
8357 && insn_data[new_icode].operand[1].predicate
8358 == insn_data[icode].operand[1].predicate);
8359 icode = new_icode;
8360 goto non_constant;
8361 }
8362 break;
8363 default:
8364 gcc_unreachable ();
8365 }
8366 }
8367 }
8368 else
8369 {
8370 non_constant:
8371 if (VECTOR_MODE_P (mode))
8372 op = safe_vector_operand (op, mode);
8373
8374 /* If we aren't optimizing, only allow one memory operand to be
8375 generated. */
8376 if (memory_operand (op, mode))
8377 num_memory++;
8378
8379 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8380
8381 if (optimize
8382 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8383 || num_memory > 1)
8384 op = force_reg (mode, op);
8385 }
8386
8387 args[i].op = op;
8388 args[i].mode = mode;
8389 }
8390
8391 switch (nargs)
8392 {
8393 case 1:
8394 pat = GEN_FCN (icode) (target, args[0].op);
8395 break;
8396
8397 case 2:
8398 if (tf_p)
8399 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8400 GEN_INT ((int)sub_code));
8401 else if (! comparison_p)
8402 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8403 else
8404 {
8405 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8406 args[0].op,
8407 args[1].op);
8408
8409 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8410 }
8411 break;
8412
8413 case 3:
8414 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8415 break;
8416
8417 case 4:
8418 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8419 break;
8420
8421 default:
8422 gcc_unreachable ();
8423 }
8424
8425 if (! pat)
8426 return 0;
8427
8428 emit_insn (pat);
8429 return target;
8430 }
8431
8432 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8433 insns with vec_merge. */
8434
8435 static rtx
8436 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8437 rtx target)
8438 {
8439 rtx pat;
8440 tree arg0 = CALL_EXPR_ARG (exp, 0);
8441 rtx op1, op0 = expand_normal (arg0);
8442 machine_mode tmode = insn_data[icode].operand[0].mode;
8443 machine_mode mode0 = insn_data[icode].operand[1].mode;
8444
8445 if (optimize || !target
8446 || GET_MODE (target) != tmode
8447 || !insn_data[icode].operand[0].predicate (target, tmode))
8448 target = gen_reg_rtx (tmode);
8449
8450 if (VECTOR_MODE_P (mode0))
8451 op0 = safe_vector_operand (op0, mode0);
8452
8453 if ((optimize && !register_operand (op0, mode0))
8454 || !insn_data[icode].operand[1].predicate (op0, mode0))
8455 op0 = copy_to_mode_reg (mode0, op0);
8456
8457 op1 = op0;
8458 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8459 op1 = copy_to_mode_reg (mode0, op1);
8460
8461 pat = GEN_FCN (icode) (target, op0, op1);
8462 if (! pat)
8463 return 0;
8464 emit_insn (pat);
8465 return target;
8466 }
8467
8468 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8469
8470 static rtx
8471 ix86_expand_sse_compare (const struct builtin_description *d,
8472 tree exp, rtx target, bool swap)
8473 {
8474 rtx pat;
8475 tree arg0 = CALL_EXPR_ARG (exp, 0);
8476 tree arg1 = CALL_EXPR_ARG (exp, 1);
8477 rtx op0 = expand_normal (arg0);
8478 rtx op1 = expand_normal (arg1);
8479 rtx op2;
8480 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8481 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8482 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8483 enum rtx_code comparison = d->comparison;
8484
8485 if (VECTOR_MODE_P (mode0))
8486 op0 = safe_vector_operand (op0, mode0);
8487 if (VECTOR_MODE_P (mode1))
8488 op1 = safe_vector_operand (op1, mode1);
8489
8490 /* Swap operands if we have a comparison that isn't available in
8491 hardware. */
8492 if (swap)
8493 std::swap (op0, op1);
8494
8495 if (optimize || !target
8496 || GET_MODE (target) != tmode
8497 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8498 target = gen_reg_rtx (tmode);
8499
8500 if ((optimize && !register_operand (op0, mode0))
8501 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8502 op0 = copy_to_mode_reg (mode0, op0);
8503 if ((optimize && !register_operand (op1, mode1))
8504 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8505 op1 = copy_to_mode_reg (mode1, op1);
8506
8507 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8508 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8509 if (! pat)
8510 return 0;
8511 emit_insn (pat);
8512 return target;
8513 }
8514
8515 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
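/* The comi pattern itself only sets the flags register; the boolean result
   is then materialized by storing a comparison of the flags against zero
   into the low byte of a fresh SImode pseudo, which is what is returned.  */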
8516
8517 static rtx
8518 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8519 rtx target)
8520 {
8521 rtx pat;
8522 tree arg0 = CALL_EXPR_ARG (exp, 0);
8523 tree arg1 = CALL_EXPR_ARG (exp, 1);
8524 rtx op0 = expand_normal (arg0);
8525 rtx op1 = expand_normal (arg1);
8526 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8527 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8528 enum rtx_code comparison = d->comparison;
8529
8530 if (VECTOR_MODE_P (mode0))
8531 op0 = safe_vector_operand (op0, mode0);
8532 if (VECTOR_MODE_P (mode1))
8533 op1 = safe_vector_operand (op1, mode1);
8534
8535 /* Swap operands if we have a comparison that isn't available in
8536 hardware. */
8537 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8538 std::swap (op0, op1);
8539
8540 target = gen_reg_rtx (SImode);
8541 emit_move_insn (target, const0_rtx);
8542 target = gen_rtx_SUBREG (QImode, target, 0);
8543
8544 if ((optimize && !register_operand (op0, mode0))
8545 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8546 op0 = copy_to_mode_reg (mode0, op0);
8547 if ((optimize && !register_operand (op1, mode1))
8548 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8549 op1 = copy_to_mode_reg (mode1, op1);
8550
8551 pat = GEN_FCN (d->icode) (op0, op1);
8552 if (! pat)
8553 return 0;
8554 emit_insn (pat);
8555 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8556 gen_rtx_fmt_ee (comparison, QImode,
8557 SET_DEST (pat),
8558 const0_rtx)));
8559
8560 return SUBREG_REG (target);
8561 }
8562
8563 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8564
8565 static rtx
8566 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8567 rtx target)
8568 {
8569 rtx pat;
8570 tree arg0 = CALL_EXPR_ARG (exp, 0);
8571 rtx op1, op0 = expand_normal (arg0);
8572 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8573 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8574
8575 if (optimize || target == 0
8576 || GET_MODE (target) != tmode
8577 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8578 target = gen_reg_rtx (tmode);
8579
8580 if (VECTOR_MODE_P (mode0))
8581 op0 = safe_vector_operand (op0, mode0);
8582
8583 if ((optimize && !register_operand (op0, mode0))
8584 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8585 op0 = copy_to_mode_reg (mode0, op0);
8586
8587 op1 = GEN_INT (d->comparison);
8588
8589 pat = GEN_FCN (d->icode) (target, op0, op1);
8590 if (! pat)
8591 return 0;
8592 emit_insn (pat);
8593 return target;
8594 }
8595
8596 static rtx
8597 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8598 tree exp, rtx target)
8599 {
8600 rtx pat;
8601 tree arg0 = CALL_EXPR_ARG (exp, 0);
8602 tree arg1 = CALL_EXPR_ARG (exp, 1);
8603 rtx op0 = expand_normal (arg0);
8604 rtx op1 = expand_normal (arg1);
8605 rtx op2;
8606 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8607 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8608 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8609
8610 if (optimize || target == 0
8611 || GET_MODE (target) != tmode
8612 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8613 target = gen_reg_rtx (tmode);
8614
8615 op0 = safe_vector_operand (op0, mode0);
8616 op1 = safe_vector_operand (op1, mode1);
8617
8618 if ((optimize && !register_operand (op0, mode0))
8619 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8620 op0 = copy_to_mode_reg (mode0, op0);
8621 if ((optimize && !register_operand (op1, mode1))
8622 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8623 op1 = copy_to_mode_reg (mode1, op1);
8624
8625 op2 = GEN_INT (d->comparison);
8626
8627 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8628 if (! pat)
8629 return 0;
8630 emit_insn (pat);
8631 return target;
8632 }
8633
8634 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8635
8636 static rtx
8637 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8638 rtx target)
8639 {
8640 rtx pat;
8641 tree arg0 = CALL_EXPR_ARG (exp, 0);
8642 tree arg1 = CALL_EXPR_ARG (exp, 1);
8643 rtx op0 = expand_normal (arg0);
8644 rtx op1 = expand_normal (arg1);
8645 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8646 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8647 enum rtx_code comparison = d->comparison;
8648
8649 if (VECTOR_MODE_P (mode0))
8650 op0 = safe_vector_operand (op0, mode0);
8651 if (VECTOR_MODE_P (mode1))
8652 op1 = safe_vector_operand (op1, mode1);
8653
8654 target = gen_reg_rtx (SImode);
8655 emit_move_insn (target, const0_rtx);
8656 target = gen_rtx_SUBREG (QImode, target, 0);
8657
8658 if ((optimize && !register_operand (op0, mode0))
8659 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8660 op0 = copy_to_mode_reg (mode0, op0);
8661 if ((optimize && !register_operand (op1, mode1))
8662 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8663 op1 = copy_to_mode_reg (mode1, op1);
8664
8665 pat = GEN_FCN (d->icode) (op0, op1);
8666 if (! pat)
8667 return 0;
8668 emit_insn (pat);
8669 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8670 gen_rtx_fmt_ee (comparison, QImode,
8671 SET_DEST (pat),
8672 const0_rtx)));
8673
8674 return SUBREG_REG (target);
8675 }
8676
8677 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8678
8679 static rtx
8680 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8681 tree exp, rtx target)
8682 {
8683 rtx pat;
8684 tree arg0 = CALL_EXPR_ARG (exp, 0);
8685 tree arg1 = CALL_EXPR_ARG (exp, 1);
8686 tree arg2 = CALL_EXPR_ARG (exp, 2);
8687 tree arg3 = CALL_EXPR_ARG (exp, 3);
8688 tree arg4 = CALL_EXPR_ARG (exp, 4);
8689 rtx scratch0, scratch1;
8690 rtx op0 = expand_normal (arg0);
8691 rtx op1 = expand_normal (arg1);
8692 rtx op2 = expand_normal (arg2);
8693 rtx op3 = expand_normal (arg3);
8694 rtx op4 = expand_normal (arg4);
8695 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8696
8697 tmode0 = insn_data[d->icode].operand[0].mode;
8698 tmode1 = insn_data[d->icode].operand[1].mode;
8699 modev2 = insn_data[d->icode].operand[2].mode;
8700 modei3 = insn_data[d->icode].operand[3].mode;
8701 modev4 = insn_data[d->icode].operand[4].mode;
8702 modei5 = insn_data[d->icode].operand[5].mode;
8703 modeimm = insn_data[d->icode].operand[6].mode;
8704
8705 if (VECTOR_MODE_P (modev2))
8706 op0 = safe_vector_operand (op0, modev2);
8707 if (VECTOR_MODE_P (modev4))
8708 op2 = safe_vector_operand (op2, modev4);
8709
8710 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8711 op0 = copy_to_mode_reg (modev2, op0);
8712 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8713 op1 = copy_to_mode_reg (modei3, op1);
8714 if ((optimize && !register_operand (op2, modev4))
8715 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8716 op2 = copy_to_mode_reg (modev4, op2);
8717 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8718 op3 = copy_to_mode_reg (modei5, op3);
8719
8720 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8721 {
8722 error ("the fifth argument must be an 8-bit immediate");
8723 return const0_rtx;
8724 }
8725
8726 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8727 {
8728 if (optimize || !target
8729 || GET_MODE (target) != tmode0
8730 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8731 target = gen_reg_rtx (tmode0);
8732
8733 scratch1 = gen_reg_rtx (tmode1);
8734
8735 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8736 }
8737 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8738 {
8739 if (optimize || !target
8740 || GET_MODE (target) != tmode1
8741 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8742 target = gen_reg_rtx (tmode1);
8743
8744 scratch0 = gen_reg_rtx (tmode0);
8745
8746 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8747 }
8748 else
8749 {
8750 gcc_assert (d->flag);
8751
8752 scratch0 = gen_reg_rtx (tmode0);
8753 scratch1 = gen_reg_rtx (tmode1);
8754
8755 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8756 }
8757
8758 if (! pat)
8759 return 0;
8760
8761 emit_insn (pat);
8762
8763 if (d->flag)
8764 {
8765 target = gen_reg_rtx (SImode);
8766 emit_move_insn (target, const0_rtx);
8767 target = gen_rtx_SUBREG (QImode, target, 0);
8768
8769 emit_insn
8770 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8771 gen_rtx_fmt_ee (EQ, QImode,
8772 gen_rtx_REG ((machine_mode) d->flag,
8773 FLAGS_REG),
8774 const0_rtx)));
8775 return SUBREG_REG (target);
8776 }
8777 else
8778 return target;
8779 }
8780
8781
8782 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8783
8784 static rtx
8785 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8786 tree exp, rtx target)
8787 {
8788 rtx pat;
8789 tree arg0 = CALL_EXPR_ARG (exp, 0);
8790 tree arg1 = CALL_EXPR_ARG (exp, 1);
8791 tree arg2 = CALL_EXPR_ARG (exp, 2);
8792 rtx scratch0, scratch1;
8793 rtx op0 = expand_normal (arg0);
8794 rtx op1 = expand_normal (arg1);
8795 rtx op2 = expand_normal (arg2);
8796 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8797
8798 tmode0 = insn_data[d->icode].operand[0].mode;
8799 tmode1 = insn_data[d->icode].operand[1].mode;
8800 modev2 = insn_data[d->icode].operand[2].mode;
8801 modev3 = insn_data[d->icode].operand[3].mode;
8802 modeimm = insn_data[d->icode].operand[4].mode;
8803
8804 if (VECTOR_MODE_P (modev2))
8805 op0 = safe_vector_operand (op0, modev2);
8806 if (VECTOR_MODE_P (modev3))
8807 op1 = safe_vector_operand (op1, modev3);
8808
8809 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8810 op0 = copy_to_mode_reg (modev2, op0);
8811 if ((optimize && !register_operand (op1, modev3))
8812 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8813 op1 = copy_to_mode_reg (modev3, op1);
8814
8815 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8816 {
8817 error ("the third argument must be an 8-bit immediate");
8818 return const0_rtx;
8819 }
8820
8821 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8822 {
8823 if (optimize || !target
8824 || GET_MODE (target) != tmode0
8825 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8826 target = gen_reg_rtx (tmode0);
8827
8828 scratch1 = gen_reg_rtx (tmode1);
8829
8830 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8831 }
8832 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8833 {
8834 if (optimize || !target
8835 || GET_MODE (target) != tmode1
8836 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8837 target = gen_reg_rtx (tmode1);
8838
8839 scratch0 = gen_reg_rtx (tmode0);
8840
8841 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8842 }
8843 else
8844 {
8845 gcc_assert (d->flag);
8846
8847 scratch0 = gen_reg_rtx (tmode0);
8848 scratch1 = gen_reg_rtx (tmode1);
8849
8850 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8851 }
8852
8853 if (! pat)
8854 return 0;
8855
8856 emit_insn (pat);
8857
8858 if (d->flag)
8859 {
8860 target = gen_reg_rtx (SImode);
8861 emit_move_insn (target, const0_rtx);
8862 target = gen_rtx_SUBREG (QImode, target, 0);
8863
8864 emit_insn
8865 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8866 gen_rtx_fmt_ee (EQ, QImode,
8867 gen_rtx_REG ((machine_mode) d->flag,
8868 FLAGS_REG),
8869 const0_rtx)));
8870 return SUBREG_REG (target);
8871 }
8872 else
8873 return target;
8874 }
8875
8876 /* Fix up modeless constants to fit the required mode. */
8877
8878 static rtx
8879 fixup_modeless_constant (rtx x, machine_mode mode)
8880 {
8881 if (GET_MODE (x) == VOIDmode)
8882 x = convert_to_mode (mode, x, 1);
8883 return x;
8884 }
8885
8886 /* Subroutine of ix86_expand_builtin to take care of insns with
8887 variable number of operands. */
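/* Roughly, each *_FTYPE_* case in the switch below classifies one builtin
   prototype: it records how many arguments to expand (NARGS), how many of
   the trailing arguments must be immediates (NARGS_CONSTANT), and, for the
   masked and mode-converting variants, MASK_POS and RMODE; a few simple
   shapes are instead dispatched directly to the helper expanders above.  */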
8888
8889 static rtx
8890 ix86_expand_args_builtin (const struct builtin_description *d,
8891 tree exp, rtx target)
8892 {
8893 rtx pat, real_target;
8894 unsigned int i, nargs;
8895 unsigned int nargs_constant = 0;
8896 unsigned int mask_pos = 0;
8897 int num_memory = 0;
8898 struct
8899 {
8900 rtx op;
8901 machine_mode mode;
8902 } args[6];
8903 bool second_arg_count = false;
8904 enum insn_code icode = d->icode;
8905 const struct insn_data_d *insn_p = &insn_data[icode];
8906 machine_mode tmode = insn_p->operand[0].mode;
8907 machine_mode rmode = VOIDmode;
8908 bool swap = false;
8909 enum rtx_code comparison = d->comparison;
8910
8911 switch ((enum ix86_builtin_func_type) d->flag)
8912 {
8913 case V2DF_FTYPE_V2DF_ROUND:
8914 case V4DF_FTYPE_V4DF_ROUND:
8915 case V8DF_FTYPE_V8DF_ROUND:
8916 case V4SF_FTYPE_V4SF_ROUND:
8917 case V8SF_FTYPE_V8SF_ROUND:
8918 case V16SF_FTYPE_V16SF_ROUND:
8919 case V4SI_FTYPE_V4SF_ROUND:
8920 case V8SI_FTYPE_V8SF_ROUND:
8921 case V16SI_FTYPE_V16SF_ROUND:
8922 return ix86_expand_sse_round (d, exp, target);
8923 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8924 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8925 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8926 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8927 case INT_FTYPE_V8SF_V8SF_PTEST:
8928 case INT_FTYPE_V4DI_V4DI_PTEST:
8929 case INT_FTYPE_V4DF_V4DF_PTEST:
8930 case INT_FTYPE_V4SF_V4SF_PTEST:
8931 case INT_FTYPE_V2DI_V2DI_PTEST:
8932 case INT_FTYPE_V2DF_V2DF_PTEST:
8933 return ix86_expand_sse_ptest (d, exp, target);
8934 case FLOAT128_FTYPE_FLOAT128:
8935 case FLOAT_FTYPE_FLOAT:
8936 case INT_FTYPE_INT:
8937 case UINT_FTYPE_UINT:
8938 case UINT16_FTYPE_UINT16:
8939 case UINT64_FTYPE_INT:
8940 case UINT64_FTYPE_UINT64:
8941 case INT64_FTYPE_INT64:
8942 case INT64_FTYPE_V4SF:
8943 case INT64_FTYPE_V2DF:
8944 case INT_FTYPE_V16QI:
8945 case INT_FTYPE_V8QI:
8946 case INT_FTYPE_V8SF:
8947 case INT_FTYPE_V4DF:
8948 case INT_FTYPE_V4SF:
8949 case INT_FTYPE_V2DF:
8950 case INT_FTYPE_V32QI:
8951 case V16QI_FTYPE_V16QI:
8952 case V8SI_FTYPE_V8SF:
8953 case V8SI_FTYPE_V4SI:
8954 case V8HI_FTYPE_V8HI:
8955 case V8HI_FTYPE_V16QI:
8956 case V8QI_FTYPE_V8QI:
8957 case V8SF_FTYPE_V8SF:
8958 case V8SF_FTYPE_V8SI:
8959 case V8SF_FTYPE_V4SF:
8960 case V8SF_FTYPE_V8HI:
8961 case V4SI_FTYPE_V4SI:
8962 case V4SI_FTYPE_V16QI:
8963 case V4SI_FTYPE_V4SF:
8964 case V4SI_FTYPE_V8SI:
8965 case V4SI_FTYPE_V8HI:
8966 case V4SI_FTYPE_V4DF:
8967 case V4SI_FTYPE_V2DF:
8968 case V4HI_FTYPE_V4HI:
8969 case V4DF_FTYPE_V4DF:
8970 case V4DF_FTYPE_V4SI:
8971 case V4DF_FTYPE_V4SF:
8972 case V4DF_FTYPE_V2DF:
8973 case V4SF_FTYPE_V4SF:
8974 case V4SF_FTYPE_V4SI:
8975 case V4SF_FTYPE_V8SF:
8976 case V4SF_FTYPE_V4DF:
8977 case V4SF_FTYPE_V8HI:
8978 case V4SF_FTYPE_V2DF:
8979 case V2DI_FTYPE_V2DI:
8980 case V2DI_FTYPE_V16QI:
8981 case V2DI_FTYPE_V8HI:
8982 case V2DI_FTYPE_V4SI:
8983 case V2DF_FTYPE_V2DF:
8984 case V2DF_FTYPE_V4SI:
8985 case V2DF_FTYPE_V4DF:
8986 case V2DF_FTYPE_V4SF:
8987 case V2DF_FTYPE_V2SI:
8988 case V2SI_FTYPE_V2SI:
8989 case V2SI_FTYPE_V4SF:
8990 case V2SI_FTYPE_V2SF:
8991 case V2SI_FTYPE_V2DF:
8992 case V2SF_FTYPE_V2SF:
8993 case V2SF_FTYPE_V2SI:
8994 case V32QI_FTYPE_V32QI:
8995 case V32QI_FTYPE_V16QI:
8996 case V16HI_FTYPE_V16HI:
8997 case V16HI_FTYPE_V8HI:
8998 case V8SI_FTYPE_V8SI:
8999 case V16HI_FTYPE_V16QI:
9000 case V8SI_FTYPE_V16QI:
9001 case V4DI_FTYPE_V16QI:
9002 case V8SI_FTYPE_V8HI:
9003 case V4DI_FTYPE_V8HI:
9004 case V4DI_FTYPE_V4SI:
9005 case V4DI_FTYPE_V2DI:
9006 case UQI_FTYPE_UQI:
9007 case UHI_FTYPE_UHI:
9008 case USI_FTYPE_USI:
9009 case USI_FTYPE_UQI:
9010 case USI_FTYPE_UHI:
9011 case UDI_FTYPE_UDI:
9012 case UHI_FTYPE_V16QI:
9013 case USI_FTYPE_V32QI:
9014 case UDI_FTYPE_V64QI:
9015 case V16QI_FTYPE_UHI:
9016 case V32QI_FTYPE_USI:
9017 case V64QI_FTYPE_UDI:
9018 case V8HI_FTYPE_UQI:
9019 case V16HI_FTYPE_UHI:
9020 case V32HI_FTYPE_USI:
9021 case V4SI_FTYPE_UQI:
9022 case V8SI_FTYPE_UQI:
9023 case V4SI_FTYPE_UHI:
9024 case V8SI_FTYPE_UHI:
9025 case UQI_FTYPE_V8HI:
9026 case UHI_FTYPE_V16HI:
9027 case USI_FTYPE_V32HI:
9028 case UQI_FTYPE_V4SI:
9029 case UQI_FTYPE_V8SI:
9030 case UHI_FTYPE_V16SI:
9031 case UQI_FTYPE_V2DI:
9032 case UQI_FTYPE_V4DI:
9033 case UQI_FTYPE_V8DI:
9034 case V16SI_FTYPE_UHI:
9035 case V2DI_FTYPE_UQI:
9036 case V4DI_FTYPE_UQI:
9037 case V16SI_FTYPE_INT:
9038 case V16SF_FTYPE_V8SF:
9039 case V16SI_FTYPE_V8SI:
9040 case V16SF_FTYPE_V4SF:
9041 case V16SI_FTYPE_V4SI:
9042 case V16SI_FTYPE_V16SF:
9043 case V16SI_FTYPE_V16SI:
9044 case V64QI_FTYPE_V64QI:
9045 case V32HI_FTYPE_V32HI:
9046 case V16SF_FTYPE_V16SF:
9047 case V8DI_FTYPE_UQI:
9048 case V8DI_FTYPE_V8DI:
9049 case V8DF_FTYPE_V4DF:
9050 case V8DF_FTYPE_V2DF:
9051 case V8DF_FTYPE_V8DF:
9052 case V4DI_FTYPE_V4DI:
9053 case V16HI_FTYPE_V16SF:
9054 case V8HI_FTYPE_V8SF:
9055 case V8HI_FTYPE_V4SF:
9056 nargs = 1;
9057 break;
9058 case V4SF_FTYPE_V4SF_VEC_MERGE:
9059 case V2DF_FTYPE_V2DF_VEC_MERGE:
9060 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9061 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9062 case V16QI_FTYPE_V16QI_V16QI:
9063 case V16QI_FTYPE_V8HI_V8HI:
9064 case V16SF_FTYPE_V16SF_V16SF:
9065 case V8QI_FTYPE_V8QI_V8QI:
9066 case V8QI_FTYPE_V4HI_V4HI:
9067 case V8HI_FTYPE_V8HI_V8HI:
9068 case V8HI_FTYPE_V16QI_V16QI:
9069 case V8HI_FTYPE_V4SI_V4SI:
9070 case V8SF_FTYPE_V8SF_V8SF:
9071 case V8SF_FTYPE_V8SF_V8SI:
9072 case V8DF_FTYPE_V8DF_V8DF:
9073 case V4SI_FTYPE_V4SI_V4SI:
9074 case V4SI_FTYPE_V8HI_V8HI:
9075 case V4SI_FTYPE_V2DF_V2DF:
9076 case V4HI_FTYPE_V4HI_V4HI:
9077 case V4HI_FTYPE_V8QI_V8QI:
9078 case V4HI_FTYPE_V2SI_V2SI:
9079 case V4DF_FTYPE_V4DF_V4DF:
9080 case V4DF_FTYPE_V4DF_V4DI:
9081 case V4SF_FTYPE_V4SF_V4SF:
9082 case V4SF_FTYPE_V4SF_V4SI:
9083 case V4SF_FTYPE_V4SF_V2SI:
9084 case V4SF_FTYPE_V4SF_V2DF:
9085 case V4SF_FTYPE_V4SF_UINT:
9086 case V4SF_FTYPE_V4SF_DI:
9087 case V4SF_FTYPE_V4SF_SI:
9088 case V2DI_FTYPE_V2DI_V2DI:
9089 case V2DI_FTYPE_V16QI_V16QI:
9090 case V2DI_FTYPE_V4SI_V4SI:
9091 case V2DI_FTYPE_V2DI_V16QI:
9092 case V2SI_FTYPE_V2SI_V2SI:
9093 case V2SI_FTYPE_V4HI_V4HI:
9094 case V2SI_FTYPE_V2SF_V2SF:
9095 case V2DF_FTYPE_V2DF_V2DF:
9096 case V2DF_FTYPE_V2DF_V4SF:
9097 case V2DF_FTYPE_V2DF_V2DI:
9098 case V2DF_FTYPE_V2DF_DI:
9099 case V2DF_FTYPE_V2DF_SI:
9100 case V2DF_FTYPE_V2DF_UINT:
9101 case V2SF_FTYPE_V2SF_V2SF:
9102 case V1DI_FTYPE_V1DI_V1DI:
9103 case V1DI_FTYPE_V8QI_V8QI:
9104 case V1DI_FTYPE_V2SI_V2SI:
9105 case V32QI_FTYPE_V16HI_V16HI:
9106 case V16HI_FTYPE_V8SI_V8SI:
9107 case V64QI_FTYPE_V64QI_V64QI:
9108 case V32QI_FTYPE_V32QI_V32QI:
9109 case V16HI_FTYPE_V32QI_V32QI:
9110 case V16HI_FTYPE_V16HI_V16HI:
9111 case V8SI_FTYPE_V4DF_V4DF:
9112 case V8SI_FTYPE_V8SI_V8SI:
9113 case V8SI_FTYPE_V16HI_V16HI:
9114 case V4DI_FTYPE_V4DI_V4DI:
9115 case V4DI_FTYPE_V8SI_V8SI:
9116 case V8DI_FTYPE_V64QI_V64QI:
9117 if (comparison == UNKNOWN)
9118 return ix86_expand_binop_builtin (icode, exp, target);
9119 nargs = 2;
9120 break;
9121 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9122 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9123 gcc_assert (comparison != UNKNOWN);
9124 nargs = 2;
9125 swap = true;
9126 break;
9127 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9128 case V16HI_FTYPE_V16HI_SI_COUNT:
9129 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9130 case V8SI_FTYPE_V8SI_SI_COUNT:
9131 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9132 case V4DI_FTYPE_V4DI_INT_COUNT:
9133 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9134 case V8HI_FTYPE_V8HI_SI_COUNT:
9135 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9136 case V4SI_FTYPE_V4SI_SI_COUNT:
9137 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9138 case V4HI_FTYPE_V4HI_SI_COUNT:
9139 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9140 case V2DI_FTYPE_V2DI_SI_COUNT:
9141 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9142 case V2SI_FTYPE_V2SI_SI_COUNT:
9143 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9144 case V1DI_FTYPE_V1DI_SI_COUNT:
9145 nargs = 2;
9146 second_arg_count = true;
9147 break;
9148 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9149 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9150 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9151 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9152 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9153 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9154 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9155 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9156 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9157 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9158 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9159 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9160 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9161 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9162 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9163 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9164 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9165 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9166 nargs = 4;
9167 second_arg_count = true;
9168 break;
9169 case UINT64_FTYPE_UINT64_UINT64:
9170 case UINT_FTYPE_UINT_UINT:
9171 case UINT_FTYPE_UINT_USHORT:
9172 case UINT_FTYPE_UINT_UCHAR:
9173 case UINT16_FTYPE_UINT16_INT:
9174 case UINT8_FTYPE_UINT8_INT:
9175 case UQI_FTYPE_UQI_UQI:
9176 case UHI_FTYPE_UHI_UHI:
9177 case USI_FTYPE_USI_USI:
9178 case UDI_FTYPE_UDI_UDI:
9179 case V16SI_FTYPE_V8DF_V8DF:
9180 case V32HI_FTYPE_V16SF_V16SF:
9181 case V16HI_FTYPE_V8SF_V8SF:
9182 case V8HI_FTYPE_V4SF_V4SF:
9183 case V16HI_FTYPE_V16SF_UHI:
9184 case V8HI_FTYPE_V8SF_UQI:
9185 case V8HI_FTYPE_V4SF_UQI:
9186 nargs = 2;
9187 break;
9188 case V2DI_FTYPE_V2DI_INT_CONVERT:
9189 nargs = 2;
9190 rmode = V1TImode;
9191 nargs_constant = 1;
9192 break;
9193 case V4DI_FTYPE_V4DI_INT_CONVERT:
9194 nargs = 2;
9195 rmode = V2TImode;
9196 nargs_constant = 1;
9197 break;
9198 case V8DI_FTYPE_V8DI_INT_CONVERT:
9199 nargs = 2;
9200 rmode = V4TImode;
9201 nargs_constant = 1;
9202 break;
9203 case V8HI_FTYPE_V8HI_INT:
9204 case V8HI_FTYPE_V8SF_INT:
9205 case V16HI_FTYPE_V16SF_INT:
9206 case V8HI_FTYPE_V4SF_INT:
9207 case V8SF_FTYPE_V8SF_INT:
9208 case V4SF_FTYPE_V16SF_INT:
9209 case V16SF_FTYPE_V16SF_INT:
9210 case V4SI_FTYPE_V4SI_INT:
9211 case V4SI_FTYPE_V8SI_INT:
9212 case V4HI_FTYPE_V4HI_INT:
9213 case V4DF_FTYPE_V4DF_INT:
9214 case V4DF_FTYPE_V8DF_INT:
9215 case V4SF_FTYPE_V4SF_INT:
9216 case V4SF_FTYPE_V8SF_INT:
9217 case V2DI_FTYPE_V2DI_INT:
9218 case V2DF_FTYPE_V2DF_INT:
9219 case V2DF_FTYPE_V4DF_INT:
9220 case V16HI_FTYPE_V16HI_INT:
9221 case V8SI_FTYPE_V8SI_INT:
9222 case V16SI_FTYPE_V16SI_INT:
9223 case V4SI_FTYPE_V16SI_INT:
9224 case V4DI_FTYPE_V4DI_INT:
9225 case V2DI_FTYPE_V4DI_INT:
9226 case V4DI_FTYPE_V8DI_INT:
9227 case UQI_FTYPE_UQI_UQI_CONST:
9228 case UHI_FTYPE_UHI_UQI:
9229 case USI_FTYPE_USI_UQI:
9230 case UDI_FTYPE_UDI_UQI:
9231 nargs = 2;
9232 nargs_constant = 1;
9233 break;
9234 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9235 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9236 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9237 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9238 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9239 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9240 case UHI_FTYPE_V16SI_V16SI_UHI:
9241 case UQI_FTYPE_V8DI_V8DI_UQI:
9242 case V16HI_FTYPE_V16SI_V16HI_UHI:
9243 case V16QI_FTYPE_V16SI_V16QI_UHI:
9244 case V16QI_FTYPE_V8DI_V16QI_UQI:
9245 case V16SF_FTYPE_V16SF_V16SF_UHI:
9246 case V16SF_FTYPE_V4SF_V16SF_UHI:
9247 case V16SI_FTYPE_SI_V16SI_UHI:
9248 case V16SI_FTYPE_V16HI_V16SI_UHI:
9249 case V16SI_FTYPE_V16QI_V16SI_UHI:
9250 case V8SF_FTYPE_V4SF_V8SF_UQI:
9251 case V4DF_FTYPE_V2DF_V4DF_UQI:
9252 case V8SI_FTYPE_V4SI_V8SI_UQI:
9253 case V8SI_FTYPE_SI_V8SI_UQI:
9254 case V4SI_FTYPE_V4SI_V4SI_UQI:
9255 case V4SI_FTYPE_SI_V4SI_UQI:
9256 case V4DI_FTYPE_V2DI_V4DI_UQI:
9257 case V4DI_FTYPE_DI_V4DI_UQI:
9258 case V2DI_FTYPE_V2DI_V2DI_UQI:
9259 case V2DI_FTYPE_DI_V2DI_UQI:
9260 case V64QI_FTYPE_V64QI_V64QI_UDI:
9261 case V64QI_FTYPE_V16QI_V64QI_UDI:
9262 case V64QI_FTYPE_QI_V64QI_UDI:
9263 case V32QI_FTYPE_V32QI_V32QI_USI:
9264 case V32QI_FTYPE_V16QI_V32QI_USI:
9265 case V32QI_FTYPE_QI_V32QI_USI:
9266 case V16QI_FTYPE_V16QI_V16QI_UHI:
9267 case V16QI_FTYPE_QI_V16QI_UHI:
9268 case V32HI_FTYPE_V8HI_V32HI_USI:
9269 case V32HI_FTYPE_HI_V32HI_USI:
9270 case V16HI_FTYPE_V8HI_V16HI_UHI:
9271 case V16HI_FTYPE_HI_V16HI_UHI:
9272 case V8HI_FTYPE_V8HI_V8HI_UQI:
9273 case V8HI_FTYPE_HI_V8HI_UQI:
9274 case V8SF_FTYPE_V8HI_V8SF_UQI:
9275 case V4SF_FTYPE_V8HI_V4SF_UQI:
9276 case V8SI_FTYPE_V8SF_V8SI_UQI:
9277 case V4SI_FTYPE_V4SF_V4SI_UQI:
9278 case V4DI_FTYPE_V4SF_V4DI_UQI:
9279 case V2DI_FTYPE_V4SF_V2DI_UQI:
9280 case V4SF_FTYPE_V4DI_V4SF_UQI:
9281 case V4SF_FTYPE_V2DI_V4SF_UQI:
9282 case V4DF_FTYPE_V4DI_V4DF_UQI:
9283 case V2DF_FTYPE_V2DI_V2DF_UQI:
9284 case V16QI_FTYPE_V8HI_V16QI_UQI:
9285 case V16QI_FTYPE_V16HI_V16QI_UHI:
9286 case V16QI_FTYPE_V4SI_V16QI_UQI:
9287 case V16QI_FTYPE_V8SI_V16QI_UQI:
9288 case V8HI_FTYPE_V4SI_V8HI_UQI:
9289 case V8HI_FTYPE_V8SI_V8HI_UQI:
9290 case V16QI_FTYPE_V2DI_V16QI_UQI:
9291 case V16QI_FTYPE_V4DI_V16QI_UQI:
9292 case V8HI_FTYPE_V2DI_V8HI_UQI:
9293 case V8HI_FTYPE_V4DI_V8HI_UQI:
9294 case V4SI_FTYPE_V2DI_V4SI_UQI:
9295 case V4SI_FTYPE_V4DI_V4SI_UQI:
9296 case V32QI_FTYPE_V32HI_V32QI_USI:
9297 case UHI_FTYPE_V16QI_V16QI_UHI:
9298 case USI_FTYPE_V32QI_V32QI_USI:
9299 case UDI_FTYPE_V64QI_V64QI_UDI:
9300 case UQI_FTYPE_V8HI_V8HI_UQI:
9301 case UHI_FTYPE_V16HI_V16HI_UHI:
9302 case USI_FTYPE_V32HI_V32HI_USI:
9303 case UQI_FTYPE_V4SI_V4SI_UQI:
9304 case UQI_FTYPE_V8SI_V8SI_UQI:
9305 case UQI_FTYPE_V2DI_V2DI_UQI:
9306 case UQI_FTYPE_V4DI_V4DI_UQI:
9307 case V4SF_FTYPE_V2DF_V4SF_UQI:
9308 case V4SF_FTYPE_V4DF_V4SF_UQI:
9309 case V16SI_FTYPE_V16SI_V16SI_UHI:
9310 case V16SI_FTYPE_V4SI_V16SI_UHI:
9311 case V2DI_FTYPE_V4SI_V2DI_UQI:
9312 case V2DI_FTYPE_V8HI_V2DI_UQI:
9313 case V2DI_FTYPE_V16QI_V2DI_UQI:
9314 case V4DI_FTYPE_V4DI_V4DI_UQI:
9315 case V4DI_FTYPE_V4SI_V4DI_UQI:
9316 case V4DI_FTYPE_V8HI_V4DI_UQI:
9317 case V4DI_FTYPE_V16QI_V4DI_UQI:
9318 case V4DI_FTYPE_V4DF_V4DI_UQI:
9319 case V2DI_FTYPE_V2DF_V2DI_UQI:
9320 case V4SI_FTYPE_V4DF_V4SI_UQI:
9321 case V4SI_FTYPE_V2DF_V4SI_UQI:
9322 case V4SI_FTYPE_V8HI_V4SI_UQI:
9323 case V4SI_FTYPE_V16QI_V4SI_UQI:
9324 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9325 case V8DF_FTYPE_V2DF_V8DF_UQI:
9326 case V8DF_FTYPE_V4DF_V8DF_UQI:
9327 case V8DF_FTYPE_V8DF_V8DF_UQI:
9328 case V8SF_FTYPE_V8SF_V8SF_UQI:
9329 case V8SF_FTYPE_V8SI_V8SF_UQI:
9330 case V4DF_FTYPE_V4DF_V4DF_UQI:
9331 case V4SF_FTYPE_V4SF_V4SF_UQI:
9332 case V2DF_FTYPE_V2DF_V2DF_UQI:
9333 case V2DF_FTYPE_V4SF_V2DF_UQI:
9334 case V2DF_FTYPE_V4SI_V2DF_UQI:
9335 case V4SF_FTYPE_V4SI_V4SF_UQI:
9336 case V4DF_FTYPE_V4SF_V4DF_UQI:
9337 case V4DF_FTYPE_V4SI_V4DF_UQI:
9338 case V8SI_FTYPE_V8SI_V8SI_UQI:
9339 case V8SI_FTYPE_V8HI_V8SI_UQI:
9340 case V8SI_FTYPE_V16QI_V8SI_UQI:
9341 case V8DF_FTYPE_V8SI_V8DF_UQI:
9342 case V8DI_FTYPE_DI_V8DI_UQI:
9343 case V16SF_FTYPE_V8SF_V16SF_UHI:
9344 case V16SI_FTYPE_V8SI_V16SI_UHI:
9345 case V16HI_FTYPE_V16HI_V16HI_UHI:
9346 case V8HI_FTYPE_V16QI_V8HI_UQI:
9347 case V16HI_FTYPE_V16QI_V16HI_UHI:
9348 case V32HI_FTYPE_V32HI_V32HI_USI:
9349 case V32HI_FTYPE_V32QI_V32HI_USI:
9350 case V8DI_FTYPE_V16QI_V8DI_UQI:
9351 case V8DI_FTYPE_V2DI_V8DI_UQI:
9352 case V8DI_FTYPE_V4DI_V8DI_UQI:
9353 case V8DI_FTYPE_V8DI_V8DI_UQI:
9354 case V8DI_FTYPE_V8HI_V8DI_UQI:
9355 case V8DI_FTYPE_V8SI_V8DI_UQI:
9356 case V8HI_FTYPE_V8DI_V8HI_UQI:
9357 case V8SI_FTYPE_V8DI_V8SI_UQI:
9358 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9359 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9360 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9361 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9362 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9363 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9364 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9365 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9366 case V32HI_FTYPE_V16SF_V16SF_USI:
9367 case V16HI_FTYPE_V8SF_V8SF_UHI:
9368 case V8HI_FTYPE_V4SF_V4SF_UQI:
9369 case V16HI_FTYPE_V16SF_V16HI_UHI:
9370 case V8HI_FTYPE_V8SF_V8HI_UQI:
9371 case V8HI_FTYPE_V4SF_V8HI_UQI:
9372 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9373 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9374 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9375 nargs = 3;
9376 break;
9377 case V32QI_FTYPE_V32QI_V32QI_INT:
9378 case V16HI_FTYPE_V16HI_V16HI_INT:
9379 case V16QI_FTYPE_V16QI_V16QI_INT:
9380 case V4DI_FTYPE_V4DI_V4DI_INT:
9381 case V8HI_FTYPE_V8HI_V8HI_INT:
9382 case V8SI_FTYPE_V8SI_V8SI_INT:
9383 case V8SI_FTYPE_V8SI_V4SI_INT:
9384 case V8SF_FTYPE_V8SF_V8SF_INT:
9385 case V8SF_FTYPE_V8SF_V4SF_INT:
9386 case V4SI_FTYPE_V4SI_V4SI_INT:
9387 case V4DF_FTYPE_V4DF_V4DF_INT:
9388 case V16SF_FTYPE_V16SF_V16SF_INT:
9389 case V16SF_FTYPE_V16SF_V4SF_INT:
9390 case V16SI_FTYPE_V16SI_V4SI_INT:
9391 case V4DF_FTYPE_V4DF_V2DF_INT:
9392 case V4SF_FTYPE_V4SF_V4SF_INT:
9393 case V2DI_FTYPE_V2DI_V2DI_INT:
9394 case V4DI_FTYPE_V4DI_V2DI_INT:
9395 case V2DF_FTYPE_V2DF_V2DF_INT:
9396 case UQI_FTYPE_V8DI_V8UDI_INT:
9397 case UQI_FTYPE_V8DF_V8DF_INT:
9398 case UQI_FTYPE_V2DF_V2DF_INT:
9399 case UQI_FTYPE_V4SF_V4SF_INT:
9400 case UHI_FTYPE_V16SI_V16SI_INT:
9401 case UHI_FTYPE_V16SF_V16SF_INT:
9402 case V64QI_FTYPE_V64QI_V64QI_INT:
9403 case V32HI_FTYPE_V32HI_V32HI_INT:
9404 case V16SI_FTYPE_V16SI_V16SI_INT:
9405 case V8DI_FTYPE_V8DI_V8DI_INT:
9406 nargs = 3;
9407 nargs_constant = 1;
9408 break;
9409 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9410 nargs = 3;
9411 rmode = V4DImode;
9412 nargs_constant = 1;
9413 break;
9414 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9415 nargs = 3;
9416 rmode = V2DImode;
9417 nargs_constant = 1;
9418 break;
9419 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9420 nargs = 3;
9421 rmode = DImode;
9422 nargs_constant = 1;
9423 break;
9424 case V2DI_FTYPE_V2DI_UINT_UINT:
9425 nargs = 3;
9426 nargs_constant = 2;
9427 break;
9428 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9429 nargs = 3;
9430 rmode = V8DImode;
9431 nargs_constant = 1;
9432 break;
9433 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9434 nargs = 5;
9435 rmode = V8DImode;
9436 mask_pos = 2;
9437 nargs_constant = 1;
9438 break;
9439 case QI_FTYPE_V8DF_INT_UQI:
9440 case QI_FTYPE_V4DF_INT_UQI:
9441 case QI_FTYPE_V2DF_INT_UQI:
9442 case HI_FTYPE_V16SF_INT_UHI:
9443 case QI_FTYPE_V8SF_INT_UQI:
9444 case QI_FTYPE_V4SF_INT_UQI:
9445 case V4SI_FTYPE_V4SI_V4SI_UHI:
9446 case V8SI_FTYPE_V8SI_V8SI_UHI:
9447 nargs = 3;
9448 mask_pos = 1;
9449 nargs_constant = 1;
9450 break;
9451 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9452 nargs = 5;
9453 rmode = V4DImode;
9454 mask_pos = 2;
9455 nargs_constant = 1;
9456 break;
9457 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9458 nargs = 5;
9459 rmode = V2DImode;
9460 mask_pos = 2;
9461 nargs_constant = 1;
9462 break;
9463 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9464 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9465 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9466 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9467 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9468 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9469 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9470 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9471 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9472 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9473 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9474 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9475 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9476 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9477 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9478 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9479 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9480 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9481 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9482 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9483 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9484 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9485 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9486 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9487 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9488 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9489 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9490 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9491 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9492 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9493 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9494 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9495 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9496 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9497 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9498 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9499 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9500 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9501 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9502 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9503 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9504 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9505 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9506 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9507 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9508 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9509 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9510 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9511 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9512 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9513 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9514 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9515 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9516 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9517 nargs = 4;
9518 break;
9519 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9520 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9521 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9522 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9523 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9524 nargs = 4;
9525 nargs_constant = 1;
9526 break;
9527 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9528 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9529 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9530 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9531 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9532 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9533 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9534 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9535 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9536 case USI_FTYPE_V32QI_V32QI_INT_USI:
9537 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9538 case USI_FTYPE_V32HI_V32HI_INT_USI:
9539 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9540 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9541 nargs = 4;
9542 mask_pos = 1;
9543 nargs_constant = 1;
9544 break;
9545 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9546 nargs = 4;
9547 nargs_constant = 2;
9548 break;
9549 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9550 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9551 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9552 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9553 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9554 nargs = 4;
9555 break;
9556 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9557 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9558 mask_pos = 1;
9559 nargs = 4;
9560 nargs_constant = 1;
9561 break;
9562 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9563 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9564 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9565 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9566 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9567 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9568 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9569 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9570 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9571 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9572 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9573 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9574 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9575 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9576 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9577 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9578 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9579 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9580 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9581 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9582 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9583 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9584 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9585 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9586 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9587 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9588 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9589 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9590 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9591 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9592 nargs = 4;
9593 mask_pos = 2;
9594 nargs_constant = 1;
9595 break;
9596 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9597 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9598 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9599 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9600 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9601 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9602 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9603 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9604 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9605 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9606 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9607 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9608 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9609 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9610 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9611 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9612 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9613 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9614 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9615 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9616 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9617 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9618 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9619 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9620 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9621 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9622 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9623 nargs = 5;
9624 mask_pos = 2;
9625 nargs_constant = 1;
9626 break;
9627 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9628 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9629 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9630 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9631 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9632 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9633 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9634 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9635 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9636 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9637 nargs = 5;
9638 mask_pos = 1;
9639 nargs_constant = 1;
9640 break;
9641 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9642 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9643 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9644 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9645 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9646 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9647 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9648 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9649 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9650 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9651 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9652 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9653 nargs = 5;
9654 mask_pos = 1;
9655 nargs_constant = 2;
9656 break;
9657
9658 default:
9659 gcc_unreachable ();
9660 }
9661
9662 gcc_assert (nargs <= ARRAY_SIZE (args));
9663
9664 if (comparison != UNKNOWN)
9665 {
9666 gcc_assert (nargs == 2);
9667 return ix86_expand_sse_compare (d, exp, target, swap);
9668 }
9669
9670 if (rmode == VOIDmode || rmode == tmode)
9671 {
9672 if (optimize
9673 || target == 0
9674 || GET_MODE (target) != tmode
9675 || !insn_p->operand[0].predicate (target, tmode))
9676 target = gen_reg_rtx (tmode);
9677 else if (memory_operand (target, tmode))
9678 num_memory++;
9679 real_target = target;
9680 }
9681 else
9682 {
9683 real_target = gen_reg_rtx (tmode);
9684 target = lowpart_subreg (rmode, real_target, tmode);
9685 }
9686
9687 for (i = 0; i < nargs; i++)
9688 {
9689 tree arg = CALL_EXPR_ARG (exp, i);
9690 rtx op = expand_normal (arg);
9691 machine_mode mode = insn_p->operand[i + 1].mode;
9692 bool match = insn_p->operand[i + 1].predicate (op, mode);
9693
9694 if (second_arg_count && i == 1)
9695 {
9696 /* SIMD shift insns take either an 8-bit immediate or a
9697 register as the count, but the builtin functions take an
9698 int as the count. If the count doesn't match, put it in a
9699 register. The instructions use a 64-bit count; if op is
9700 only 32-bit, zero-extend it, since negative shift counts
9701 are undefined behavior and zero-extension is more
9702 efficient. */
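/* E.g. a shift builtin declared with an int count: a scalar integer of
   the wrong width is zero-extended (or truncated) via convert_modes;
   anything else goes through a lowpart subreg, and if the predicate is
   still not satisfied the count is copied into a register.  */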
9703 if (!match)
9704 {
9705 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9706 op = convert_modes (mode, GET_MODE (op), op, 1);
9707 else
9708 op = lowpart_subreg (mode, op, GET_MODE (op));
9709 if (!insn_p->operand[i + 1].predicate (op, mode))
9710 op = copy_to_reg (op);
9711 }
9712 }
9713 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9714 (!mask_pos && (nargs - i) <= nargs_constant))
9715 {
9716 if (!match)
9717 switch (icode)
9718 {
9719 case CODE_FOR_avx_vinsertf128v4di:
9720 case CODE_FOR_avx_vextractf128v4di:
9721 error ("the last argument must be an 1-bit immediate");
9722 return const0_rtx;
9723
9724 case CODE_FOR_avx512f_cmpv8di3_mask:
9725 case CODE_FOR_avx512f_cmpv16si3_mask:
9726 case CODE_FOR_avx512f_ucmpv8di3_mask:
9727 case CODE_FOR_avx512f_ucmpv16si3_mask:
9728 case CODE_FOR_avx512vl_cmpv4di3_mask:
9729 case CODE_FOR_avx512vl_cmpv8si3_mask:
9730 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9731 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9732 case CODE_FOR_avx512vl_cmpv2di3_mask:
9733 case CODE_FOR_avx512vl_cmpv4si3_mask:
9734 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9735 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9736 error ("the last argument must be a 3-bit immediate");
9737 return const0_rtx;
9738
9739 case CODE_FOR_sse4_1_roundsd:
9740 case CODE_FOR_sse4_1_roundss:
9741
9742 case CODE_FOR_sse4_1_roundpd:
9743 case CODE_FOR_sse4_1_roundps:
9744 case CODE_FOR_avx_roundpd256:
9745 case CODE_FOR_avx_roundps256:
9746
9747 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9748 case CODE_FOR_sse4_1_roundps_sfix:
9749 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9750 case CODE_FOR_avx_roundps_sfix256:
9751
9752 case CODE_FOR_sse4_1_blendps:
9753 case CODE_FOR_avx_blendpd256:
9754 case CODE_FOR_avx_vpermilv4df:
9755 case CODE_FOR_avx_vpermilv4df_mask:
9756 case CODE_FOR_avx512f_getmantv8df_mask:
9757 case CODE_FOR_avx512f_getmantv16sf_mask:
9758 case CODE_FOR_avx512vl_getmantv8sf_mask:
9759 case CODE_FOR_avx512vl_getmantv4df_mask:
9760 case CODE_FOR_avx512vl_getmantv4sf_mask:
9761 case CODE_FOR_avx512vl_getmantv2df_mask:
9762 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9763 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9764 case CODE_FOR_avx512dq_rangepv4df_mask:
9765 case CODE_FOR_avx512dq_rangepv8sf_mask:
9766 case CODE_FOR_avx512dq_rangepv2df_mask:
9767 case CODE_FOR_avx512dq_rangepv4sf_mask:
9768 case CODE_FOR_avx_shufpd256_mask:
9769 error ("the last argument must be a 4-bit immediate");
9770 return const0_rtx;
9771
9772 case CODE_FOR_sha1rnds4:
9773 case CODE_FOR_sse4_1_blendpd:
9774 case CODE_FOR_avx_vpermilv2df:
9775 case CODE_FOR_avx_vpermilv2df_mask:
9776 case CODE_FOR_xop_vpermil2v2df3:
9777 case CODE_FOR_xop_vpermil2v4sf3:
9778 case CODE_FOR_xop_vpermil2v4df3:
9779 case CODE_FOR_xop_vpermil2v8sf3:
9780 case CODE_FOR_avx512f_vinsertf32x4_mask:
9781 case CODE_FOR_avx512f_vinserti32x4_mask:
9782 case CODE_FOR_avx512f_vextractf32x4_mask:
9783 case CODE_FOR_avx512f_vextracti32x4_mask:
9784 case CODE_FOR_sse2_shufpd:
9785 case CODE_FOR_sse2_shufpd_mask:
9786 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9787 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9788 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9789 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9790 error ("the last argument must be a 2-bit immediate");
9791 return const0_rtx;
9792
9793 case CODE_FOR_avx_vextractf128v4df:
9794 case CODE_FOR_avx_vextractf128v8sf:
9795 case CODE_FOR_avx_vextractf128v8si:
9796 case CODE_FOR_avx_vinsertf128v4df:
9797 case CODE_FOR_avx_vinsertf128v8sf:
9798 case CODE_FOR_avx_vinsertf128v8si:
9799 case CODE_FOR_avx512f_vinsertf64x4_mask:
9800 case CODE_FOR_avx512f_vinserti64x4_mask:
9801 case CODE_FOR_avx512f_vextractf64x4_mask:
9802 case CODE_FOR_avx512f_vextracti64x4_mask:
9803 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9804 case CODE_FOR_avx512dq_vinserti32x8_mask:
9805 case CODE_FOR_avx512vl_vinsertv4df:
9806 case CODE_FOR_avx512vl_vinsertv4di:
9807 case CODE_FOR_avx512vl_vinsertv8sf:
9808 case CODE_FOR_avx512vl_vinsertv8si:
9809 error ("the last argument must be a 1-bit immediate");
9810 return const0_rtx;
9811
9812 case CODE_FOR_avx_vmcmpv2df3:
9813 case CODE_FOR_avx_vmcmpv4sf3:
9814 case CODE_FOR_avx_cmpv2df3:
9815 case CODE_FOR_avx_cmpv4sf3:
9816 case CODE_FOR_avx_cmpv4df3:
9817 case CODE_FOR_avx_cmpv8sf3:
9818 case CODE_FOR_avx512f_cmpv8df3_mask:
9819 case CODE_FOR_avx512f_cmpv16sf3_mask:
9820 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9821 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9822 error ("the last argument must be a 5-bit immediate");
9823 return const0_rtx;
9824
9825 default:
9826 switch (nargs_constant)
9827 {
9828 case 2:
9829 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9830 (!mask_pos && (nargs - i) == nargs_constant))
9831 {
9832 error ("the next to last argument must be an 8-bit immediate");
9833 break;
9834 }
9835 /* FALLTHRU */
9836 case 1:
9837 error ("the last argument must be an 8-bit immediate");
9838 break;
9839 default:
9840 gcc_unreachable ();
9841 }
9842 return const0_rtx;
9843 }
9844 }
9845 else
9846 {
9847 if (VECTOR_MODE_P (mode))
9848 op = safe_vector_operand (op, mode);
9849
9850 /* If we aren't optimizing, only allow one memory operand to
9851 be generated. */
9852 if (memory_operand (op, mode))
9853 num_memory++;
9854
9855 op = fixup_modeless_constant (op, mode);
9856
9857 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9858 {
9859 if (optimize || !match || num_memory > 1)
9860 op = copy_to_mode_reg (mode, op);
9861 }
9862 else
9863 {
9864 op = copy_to_reg (op);
9865 op = lowpart_subreg (mode, op, GET_MODE (op));
9866 }
9867 }
9868
9869 args[i].op = op;
9870 args[i].mode = mode;
9871 }
9872
9873 switch (nargs)
9874 {
9875 case 1:
9876 pat = GEN_FCN (icode) (real_target, args[0].op);
9877 break;
9878 case 2:
9879 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9880 break;
9881 case 3:
9882 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9883 args[2].op);
9884 break;
9885 case 4:
9886 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9887 args[2].op, args[3].op);
9888 break;
9889 case 5:
9890 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9891 args[2].op, args[3].op, args[4].op);
9892 break;
9893 case 6:
9894 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9895 args[2].op, args[3].op, args[4].op,
9896 args[5].op);
9897 break;
9898 default:
9899 gcc_unreachable ();
9900 }
9901
9902 if (! pat)
9903 return 0;
9904
9905 emit_insn (pat);
9906 return target;
9907 }
9908
9909 /* Transform a pattern of the following layout:
9910 (set A
9911 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9912 into:
9913 (set A B)
9914 i.e. drop the embedded-rounding unspec and its rounding operand C. */
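/* Illustrative example (hypothetical operands): a pattern such as
     (set (reg:V8DF x)
          (unspec:V8DF [(plus:V8DF a b) (const_int 8)]
                       UNSPEC_EMBEDDED_ROUNDING))
   becomes
     (set (reg:V8DF x) (plus:V8DF a b)).  */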
9915
9916 static rtx
9917 ix86_erase_embedded_rounding (rtx pat)
9918 {
9919 if (GET_CODE (pat) == INSN)
9920 pat = PATTERN (pat);
9921
9922 gcc_assert (GET_CODE (pat) == SET);
9923 rtx src = SET_SRC (pat);
9924 gcc_assert (XVECLEN (src, 0) == 2);
9925 rtx p0 = XVECEXP (src, 0, 0);
9926 gcc_assert (GET_CODE (src) == UNSPEC
9927 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9928 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9929 return res;
9930 }
9931
9932 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9933 with rounding. */
9934 static rtx
9935 ix86_expand_sse_comi_round (const struct builtin_description *d,
9936 tree exp, rtx target)
9937 {
9938 rtx pat, set_dst;
9939 tree arg0 = CALL_EXPR_ARG (exp, 0);
9940 tree arg1 = CALL_EXPR_ARG (exp, 1);
9941 tree arg2 = CALL_EXPR_ARG (exp, 2);
9942 tree arg3 = CALL_EXPR_ARG (exp, 3);
9943 rtx op0 = expand_normal (arg0);
9944 rtx op1 = expand_normal (arg1);
9945 rtx op2 = expand_normal (arg2);
9946 rtx op3 = expand_normal (arg3);
9947 enum insn_code icode = d->icode;
9948 const struct insn_data_d *insn_p = &insn_data[icode];
9949 machine_mode mode0 = insn_p->operand[0].mode;
9950 machine_mode mode1 = insn_p->operand[1].mode;
9951
9952 /* See avxintrin.h for values. */
9953 static const enum rtx_code comparisons[32] =
9954 {
9955 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9956 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9957 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9958 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
9959 };
9960 static const bool ordereds[32] =
9961 {
9962 true, true, true, false, false, false, false, true,
9963 false, false, false, true, true, true, true, false,
9964 true, true, true, false, false, false, false, true,
9965 false, false, false, true, true, true, true, false
9966 };
9967 static const bool non_signalings[32] =
9968 {
9969 true, false, false, true, true, false, false, true,
9970 true, false, false, true, true, false, false, true,
9971 false, true, true, false, false, true, true, false,
9972 false, true, true, false, false, true, true, false
9973 };
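/* Example rows (values as defined in avxintrin.h): index 0 is _CMP_EQ_OQ,
   i.e. comparison EQ, ordered, non-signaling; index 1 is _CMP_LT_OS,
   i.e. LT, ordered, signaling.  */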
9974
9975 if (!CONST_INT_P (op2))
9976 {
9977 error ("the third argument must be a comparison constant");
9978 return const0_rtx;
9979 }
9980 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
9981 {
9982 error ("incorrect comparison mode");
9983 return const0_rtx;
9984 }
9985
9986 if (!insn_p->operand[2].predicate (op3, SImode))
9987 {
9988 error ("incorrect rounding operand");
9989 return const0_rtx;
9990 }
9991
9992 if (VECTOR_MODE_P (mode0))
9993 op0 = safe_vector_operand (op0, mode0);
9994 if (VECTOR_MODE_P (mode1))
9995 op1 = safe_vector_operand (op1, mode1);
9996
9997 enum rtx_code comparison = comparisons[INTVAL (op2)];
9998 bool ordered = ordereds[INTVAL (op2)];
9999 bool non_signaling = non_signalings[INTVAL (op2)];
10000 rtx const_val = const0_rtx;
10001
10002 bool check_unordered = false;
10003 machine_mode mode = CCFPmode;
10004 switch (comparison)
10005 {
10006 case ORDERED:
10007 if (!ordered)
10008 {
10009 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10010 if (!non_signaling)
10011 ordered = true;
10012 mode = CCSmode;
10013 }
10014 else
10015 {
10016 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10017 if (non_signaling)
10018 ordered = false;
10019 mode = CCPmode;
10020 }
10021 comparison = NE;
10022 break;
10023 case UNORDERED:
10024 if (ordered)
10025 {
10026 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10027 if (non_signaling)
10028 ordered = false;
10029 mode = CCSmode;
10030 }
10031 else
10032 {
10033 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10034 if (!non_signaling)
10035 ordered = true;
10036 mode = CCPmode;
10037 }
10038 comparison = EQ;
10039 break;
10040
10041 case LE: /* -> GE */
10042 case LT: /* -> GT */
10043 case UNGE: /* -> UNLE */
10044 case UNGT: /* -> UNLT */
10045 std::swap (op0, op1);
10046 comparison = swap_condition (comparison);
10047 /* FALLTHRU */
10048 case GT:
10049 case GE:
10050 case UNEQ:
10051 case UNLT:
10052 case UNLE:
10053 case LTGT:
10054 /* These are supported by CCFPmode. NB: Use ordered/signaling
10055 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10056 with NAN operands. */
10057 if (ordered == non_signaling)
10058 ordered = !ordered;
10059 break;
10060 case EQ:
10061 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10062 _CMP_EQ_OQ/_CMP_EQ_OS. */
10063 check_unordered = true;
10064 mode = CCZmode;
10065 break;
10066 case NE:
10067 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10068 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10069 gcc_assert (!ordered);
10070 check_unordered = true;
10071 mode = CCZmode;
10072 const_val = const1_rtx;
10073 break;
10074 default:
10075 gcc_unreachable ();
10076 }
10077
10078 target = gen_reg_rtx (SImode);
10079 emit_move_insn (target, const_val);
10080 target = gen_rtx_SUBREG (QImode, target, 0);
10081
10082 if ((optimize && !register_operand (op0, mode0))
10083 || !insn_p->operand[0].predicate (op0, mode0))
10084 op0 = copy_to_mode_reg (mode0, op0);
10085 if ((optimize && !register_operand (op1, mode1))
10086 || !insn_p->operand[1].predicate (op1, mode1))
10087 op1 = copy_to_mode_reg (mode1, op1);
10088
10089 /*
10090 1. COMI: ordered and signaling.
10091 2. UCOMI: unordered and non-signaling.
10092 */
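/* i.e. the (v)comiss/(v)comisd forms vs. the (v)ucomiss/(v)ucomisd
   forms.  */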
10093 if (non_signaling)
10094 icode = (icode == CODE_FOR_sse_comi_round
10095 ? CODE_FOR_sse_ucomi_round
10096 : CODE_FOR_sse2_ucomi_round);
10097
10098 pat = GEN_FCN (icode) (op0, op1, op3);
10099 if (! pat)
10100 return 0;
10101
10102 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10103 if (INTVAL (op3) == NO_ROUND)
10104 {
10105 pat = ix86_erase_embedded_rounding (pat);
10106 if (! pat)
10107 return 0;
10108
10109 set_dst = SET_DEST (pat);
10110 }
10111 else
10112 {
10113 gcc_assert (GET_CODE (pat) == SET);
10114 set_dst = SET_DEST (pat);
10115 }
10116
10117 emit_insn (pat);
10118
10119 rtx_code_label *label = NULL;
10120
10121 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10122 sufficient with NAN operands. */
10123 if (check_unordered)
10124 {
10125 gcc_assert (comparison == EQ || comparison == NE);
10126
10127 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10128 label = gen_label_rtx ();
10129 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10130 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10131 gen_rtx_LABEL_REF (VOIDmode, label),
10132 pc_rtx);
10133 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10134 }
10135
10136 /* NB: Set CCFPmode and check a different CCmode which is a subset
10137 of CCFPmode. */
10138 if (GET_MODE (set_dst) != mode)
10139 {
10140 gcc_assert (mode == CCAmode || mode == CCCmode
10141 || mode == CCOmode || mode == CCPmode
10142 || mode == CCSmode || mode == CCZmode);
10143 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10144 }
10145
10146 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10147 gen_rtx_fmt_ee (comparison, QImode,
10148 set_dst,
10149 const0_rtx)));
10150
10151 if (label)
10152 emit_label (label);
10153
10154 return SUBREG_REG (target);
10155 }
10156
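/* Subroutine of ix86_expand_builtin to take care of insns with rounding:
   the last argument is an embedded rounding / SAE immediate; when it is
   NO_ROUND, the redundant embedded-rounding unspec is erased again via
   ix86_erase_embedded_rounding.  */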
10157 static rtx
10158 ix86_expand_round_builtin (const struct builtin_description *d,
10159 tree exp, rtx target)
10160 {
10161 rtx pat;
10162 unsigned int i, nargs;
10163 struct
10164 {
10165 rtx op;
10166 machine_mode mode;
10167 } args[6];
10168 enum insn_code icode = d->icode;
10169 const struct insn_data_d *insn_p = &insn_data[icode];
10170 machine_mode tmode = insn_p->operand[0].mode;
10171 unsigned int nargs_constant = 0;
10172 unsigned int redundant_embed_rnd = 0;
10173
10174 switch ((enum ix86_builtin_func_type) d->flag)
10175 {
10176 case UINT64_FTYPE_V2DF_INT:
10177 case UINT64_FTYPE_V4SF_INT:
10178 case UINT_FTYPE_V2DF_INT:
10179 case UINT_FTYPE_V4SF_INT:
10180 case INT64_FTYPE_V2DF_INT:
10181 case INT64_FTYPE_V4SF_INT:
10182 case INT_FTYPE_V2DF_INT:
10183 case INT_FTYPE_V4SF_INT:
10184 nargs = 2;
10185 break;
10186 case V4SF_FTYPE_V4SF_UINT_INT:
10187 case V4SF_FTYPE_V4SF_UINT64_INT:
10188 case V2DF_FTYPE_V2DF_UINT64_INT:
10189 case V4SF_FTYPE_V4SF_INT_INT:
10190 case V4SF_FTYPE_V4SF_INT64_INT:
10191 case V2DF_FTYPE_V2DF_INT64_INT:
10192 case V4SF_FTYPE_V4SF_V4SF_INT:
10193 case V2DF_FTYPE_V2DF_V2DF_INT:
10194 case V4SF_FTYPE_V4SF_V2DF_INT:
10195 case V2DF_FTYPE_V2DF_V4SF_INT:
10196 nargs = 3;
10197 break;
10198 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10199 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10200 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10201 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10202 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10203 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10204 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10205 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10206 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10207 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10208 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10209 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10210 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10211 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10212 nargs = 4;
10213 break;
10214 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10215 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10216 nargs_constant = 2;
10217 nargs = 4;
10218 break;
10219 case INT_FTYPE_V4SF_V4SF_INT_INT:
10220 case INT_FTYPE_V2DF_V2DF_INT_INT:
10221 return ix86_expand_sse_comi_round (d, exp, target);
10222 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10223 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10224 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10225 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10226 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10227 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10228 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10229 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10230 nargs = 5;
10231 break;
10232 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10233 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10234 nargs_constant = 4;
10235 nargs = 5;
10236 break;
10237 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10238 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10239 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10240 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10241 nargs_constant = 3;
10242 nargs = 5;
10243 break;
10244 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10245 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10246 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10247 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10248 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10249 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10250 nargs = 6;
10251 nargs_constant = 4;
10252 break;
10253 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10254 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10255 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10256 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10257 nargs = 6;
10258 nargs_constant = 3;
10259 break;
10260 default:
10261 gcc_unreachable ();
10262 }
10263 gcc_assert (nargs <= ARRAY_SIZE (args));
10264
10265 if (optimize
10266 || target == 0
10267 || GET_MODE (target) != tmode
10268 || !insn_p->operand[0].predicate (target, tmode))
10269 target = gen_reg_rtx (tmode);
10270
10271 for (i = 0; i < nargs; i++)
10272 {
10273 tree arg = CALL_EXPR_ARG (exp, i);
10274 rtx op = expand_normal (arg);
10275 machine_mode mode = insn_p->operand[i + 1].mode;
10276 bool match = insn_p->operand[i + 1].predicate (op, mode);
10277
10278 if (i == nargs - nargs_constant)
10279 {
10280 if (!match)
10281 {
10282 switch (icode)
10283 {
10284 case CODE_FOR_avx512f_getmantv8df_mask_round:
10285 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10286 case CODE_FOR_avx512f_vgetmantv2df_round:
10287 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10288 case CODE_FOR_avx512f_vgetmantv4sf_round:
10289 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10290 error ("the immediate argument must be a 4-bit immediate");
10291 return const0_rtx;
10292 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10293 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10294 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10295 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10296 error ("the immediate argument must be a 5-bit immediate");
10297 return const0_rtx;
10298 default:
10299 error ("the immediate argument must be an 8-bit immediate");
10300 return const0_rtx;
10301 }
10302 }
10303 }
10304 else if (i == nargs - 1)
10305 {
10306 if (!insn_p->operand[nargs].predicate (op, SImode))
10307 {
10308 error ("incorrect rounding operand");
10309 return const0_rtx;
10310 }
10311
10312 /* If there is no rounding, use the normal version of the pattern. */
10313 if (INTVAL (op) == NO_ROUND)
10314 redundant_embed_rnd = 1;
10315 }
10316 else
10317 {
10318 if (VECTOR_MODE_P (mode))
10319 op = safe_vector_operand (op, mode);
10320
10321 op = fixup_modeless_constant (op, mode);
10322
10323 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10324 {
10325 if (optimize || !match)
10326 op = copy_to_mode_reg (mode, op);
10327 }
10328 else
10329 {
10330 op = copy_to_reg (op);
10331 op = lowpart_subreg (mode, op, GET_MODE (op));
10332 }
10333 }
10334
10335 args[i].op = op;
10336 args[i].mode = mode;
10337 }
10338
10339 switch (nargs)
10340 {
10341 case 1:
10342 pat = GEN_FCN (icode) (target, args[0].op);
10343 break;
10344 case 2:
10345 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10346 break;
10347 case 3:
10348 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10349 args[2].op);
10350 break;
10351 case 4:
10352 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10353 args[2].op, args[3].op);
10354 break;
10355 case 5:
10356 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10357 args[2].op, args[3].op, args[4].op);
10358 break;
10359 case 6:
10360 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10361 args[2].op, args[3].op, args[4].op,
10362 args[5].op);
10363 break;
10364 default:
10365 gcc_unreachable ();
10366 }
10367
10368 if (!pat)
10369 return 0;
10370
10371 if (redundant_embed_rnd)
10372 pat = ix86_erase_embedded_rounding (pat);
10373
10374 emit_insn (pat);
10375 return target;
10376 }
10377
10378 /* Subroutine of ix86_expand_builtin to take care of special insns
10379 with variable number of operands. */
10380
10381 static rtx
10382 ix86_expand_special_args_builtin (const struct builtin_description *d,
10383 tree exp, rtx target)
10384 {
10385 tree arg;
10386 rtx pat, op;
10387 unsigned int i, nargs, arg_adjust, memory;
10388 bool aligned_mem = false;
10389 struct
10390 {
10391 rtx op;
10392 machine_mode mode;
10393 } args[3];
10394 enum insn_code icode = d->icode;
10395 bool last_arg_constant = false;
10396 const struct insn_data_d *insn_p = &insn_data[icode];
10397 machine_mode tmode = insn_p->operand[0].mode;
10398 enum { load, store } klass;
10399
10400 switch ((enum ix86_builtin_func_type) d->flag)
10401 {
10402 case VOID_FTYPE_VOID:
10403 emit_insn (GEN_FCN (icode) (target));
10404 return 0;
10405 case VOID_FTYPE_UINT64:
10406 case VOID_FTYPE_UNSIGNED:
10407 nargs = 0;
10408 klass = store;
10409 memory = 0;
10410 break;
10411
10412 case INT_FTYPE_VOID:
10413 case USHORT_FTYPE_VOID:
10414 case UINT64_FTYPE_VOID:
10415 case UINT_FTYPE_VOID:
10416 case UNSIGNED_FTYPE_VOID:
10417 nargs = 0;
10418 klass = load;
10419 memory = 0;
10420 break;
10421 case UINT64_FTYPE_PUNSIGNED:
10422 case V2DI_FTYPE_PV2DI:
10423 case V4DI_FTYPE_PV4DI:
10424 case V32QI_FTYPE_PCCHAR:
10425 case V16QI_FTYPE_PCCHAR:
10426 case V8SF_FTYPE_PCV4SF:
10427 case V8SF_FTYPE_PCFLOAT:
10428 case V4SF_FTYPE_PCFLOAT:
10429 case V4DF_FTYPE_PCV2DF:
10430 case V4DF_FTYPE_PCDOUBLE:
10431 case V2DF_FTYPE_PCDOUBLE:
10432 case VOID_FTYPE_PVOID:
10433 case V8DI_FTYPE_PV8DI:
10434 nargs = 1;
10435 klass = load;
10436 memory = 0;
10437 switch (icode)
10438 {
10439 case CODE_FOR_sse4_1_movntdqa:
10440 case CODE_FOR_avx2_movntdqa:
10441 case CODE_FOR_avx512f_movntdqa:
10442 aligned_mem = true;
10443 break;
10444 default:
10445 break;
10446 }
10447 break;
10448 case VOID_FTYPE_PV2SF_V4SF:
10449 case VOID_FTYPE_PV8DI_V8DI:
10450 case VOID_FTYPE_PV4DI_V4DI:
10451 case VOID_FTYPE_PV2DI_V2DI:
10452 case VOID_FTYPE_PCHAR_V32QI:
10453 case VOID_FTYPE_PCHAR_V16QI:
10454 case VOID_FTYPE_PFLOAT_V16SF:
10455 case VOID_FTYPE_PFLOAT_V8SF:
10456 case VOID_FTYPE_PFLOAT_V4SF:
10457 case VOID_FTYPE_PDOUBLE_V8DF:
10458 case VOID_FTYPE_PDOUBLE_V4DF:
10459 case VOID_FTYPE_PDOUBLE_V2DF:
10460 case VOID_FTYPE_PLONGLONG_LONGLONG:
10461 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10462 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10463 case VOID_FTYPE_PINT_INT:
10464 nargs = 1;
10465 klass = store;
10466 /* Reserve memory operand for target. */
10467 memory = ARRAY_SIZE (args);
10468 switch (icode)
10469 {
10470 /* These builtins and instructions require the memory
10471 to be properly aligned. */
10472 case CODE_FOR_avx_movntv4di:
10473 case CODE_FOR_sse2_movntv2di:
10474 case CODE_FOR_avx_movntv8sf:
10475 case CODE_FOR_sse_movntv4sf:
10476 case CODE_FOR_sse4a_vmmovntv4sf:
10477 case CODE_FOR_avx_movntv4df:
10478 case CODE_FOR_sse2_movntv2df:
10479 case CODE_FOR_sse4a_vmmovntv2df:
10480 case CODE_FOR_sse2_movntidi:
10481 case CODE_FOR_sse_movntq:
10482 case CODE_FOR_sse2_movntisi:
10483 case CODE_FOR_avx512f_movntv16sf:
10484 case CODE_FOR_avx512f_movntv8df:
10485 case CODE_FOR_avx512f_movntv8di:
10486 aligned_mem = true;
10487 break;
10488 default:
10489 break;
10490 }
10491 break;
10492 case VOID_FTYPE_PVOID_PCVOID:
10493 nargs = 1;
10494 klass = store;
10495 memory = 0;
10496
10497 break;
10498 case V4SF_FTYPE_V4SF_PCV2SF:
10499 case V2DF_FTYPE_V2DF_PCDOUBLE:
10500 nargs = 2;
10501 klass = load;
10502 memory = 1;
10503 break;
10504 case V8SF_FTYPE_PCV8SF_V8SI:
10505 case V4DF_FTYPE_PCV4DF_V4DI:
10506 case V4SF_FTYPE_PCV4SF_V4SI:
10507 case V2DF_FTYPE_PCV2DF_V2DI:
10508 case V8SI_FTYPE_PCV8SI_V8SI:
10509 case V4DI_FTYPE_PCV4DI_V4DI:
10510 case V4SI_FTYPE_PCV4SI_V4SI:
10511 case V2DI_FTYPE_PCV2DI_V2DI:
10512 case VOID_FTYPE_INT_INT64:
10513 nargs = 2;
10514 klass = load;
10515 memory = 0;
10516 break;
10517 case VOID_FTYPE_PV8DF_V8DF_UQI:
10518 case VOID_FTYPE_PV4DF_V4DF_UQI:
10519 case VOID_FTYPE_PV2DF_V2DF_UQI:
10520 case VOID_FTYPE_PV16SF_V16SF_UHI:
10521 case VOID_FTYPE_PV8SF_V8SF_UQI:
10522 case VOID_FTYPE_PV4SF_V4SF_UQI:
10523 case VOID_FTYPE_PV8DI_V8DI_UQI:
10524 case VOID_FTYPE_PV4DI_V4DI_UQI:
10525 case VOID_FTYPE_PV2DI_V2DI_UQI:
10526 case VOID_FTYPE_PV16SI_V16SI_UHI:
10527 case VOID_FTYPE_PV8SI_V8SI_UQI:
10528 case VOID_FTYPE_PV4SI_V4SI_UQI:
10529 case VOID_FTYPE_PV64QI_V64QI_UDI:
10530 case VOID_FTYPE_PV32HI_V32HI_USI:
10531 case VOID_FTYPE_PV32QI_V32QI_USI:
10532 case VOID_FTYPE_PV16QI_V16QI_UHI:
10533 case VOID_FTYPE_PV16HI_V16HI_UHI:
10534 case VOID_FTYPE_PV8HI_V8HI_UQI:
10535 switch (icode)
10536 {
10537 /* These builtins and instructions require the memory
10538 to be properly aligned. */
10539 case CODE_FOR_avx512f_storev16sf_mask:
10540 case CODE_FOR_avx512f_storev16si_mask:
10541 case CODE_FOR_avx512f_storev8df_mask:
10542 case CODE_FOR_avx512f_storev8di_mask:
10543 case CODE_FOR_avx512vl_storev8sf_mask:
10544 case CODE_FOR_avx512vl_storev8si_mask:
10545 case CODE_FOR_avx512vl_storev4df_mask:
10546 case CODE_FOR_avx512vl_storev4di_mask:
10547 case CODE_FOR_avx512vl_storev4sf_mask:
10548 case CODE_FOR_avx512vl_storev4si_mask:
10549 case CODE_FOR_avx512vl_storev2df_mask:
10550 case CODE_FOR_avx512vl_storev2di_mask:
10551 aligned_mem = true;
10552 break;
10553 default:
10554 break;
10555 }
10556 /* FALLTHRU */
10557 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10558 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10559 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10560 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10561 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10562 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10563 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10564 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10565 case VOID_FTYPE_PV8SI_V8DI_UQI:
10566 case VOID_FTYPE_PV8HI_V8DI_UQI:
10567 case VOID_FTYPE_PV16HI_V16SI_UHI:
10568 case VOID_FTYPE_PUDI_V8DI_UQI:
10569 case VOID_FTYPE_PV16QI_V16SI_UHI:
10570 case VOID_FTYPE_PV4SI_V4DI_UQI:
10571 case VOID_FTYPE_PUDI_V2DI_UQI:
10572 case VOID_FTYPE_PUDI_V4DI_UQI:
10573 case VOID_FTYPE_PUSI_V2DI_UQI:
10574 case VOID_FTYPE_PV8HI_V8SI_UQI:
10575 case VOID_FTYPE_PUDI_V4SI_UQI:
10576 case VOID_FTYPE_PUSI_V4DI_UQI:
10577 case VOID_FTYPE_PUHI_V2DI_UQI:
10578 case VOID_FTYPE_PUDI_V8SI_UQI:
10579 case VOID_FTYPE_PUSI_V4SI_UQI:
10580 case VOID_FTYPE_PCHAR_V64QI_UDI:
10581 case VOID_FTYPE_PCHAR_V32QI_USI:
10582 case VOID_FTYPE_PCHAR_V16QI_UHI:
10583 case VOID_FTYPE_PSHORT_V32HI_USI:
10584 case VOID_FTYPE_PSHORT_V16HI_UHI:
10585 case VOID_FTYPE_PSHORT_V8HI_UQI:
10586 case VOID_FTYPE_PINT_V16SI_UHI:
10587 case VOID_FTYPE_PINT_V8SI_UQI:
10588 case VOID_FTYPE_PINT_V4SI_UQI:
10589 case VOID_FTYPE_PINT64_V8DI_UQI:
10590 case VOID_FTYPE_PINT64_V4DI_UQI:
10591 case VOID_FTYPE_PINT64_V2DI_UQI:
10592 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10593 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10594 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10595 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10596 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10597 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10598 case VOID_FTYPE_PV32QI_V32HI_USI:
10599 case VOID_FTYPE_PV16QI_V16HI_UHI:
10600 case VOID_FTYPE_PUDI_V8HI_UQI:
10601 nargs = 2;
10602 klass = store;
10603 /* Reserve memory operand for target. */
10604 memory = ARRAY_SIZE (args);
10605 break;
10606 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10607 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10608 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10609 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10610 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10611 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10612 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10613 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10614 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10615 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10616 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10617 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10618 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10619 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10620 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10621 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10622 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10623 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10624 switch (icode)
10625 {
10626 /* These builtins and instructions require the memory
10627 to be properly aligned. */
10628 case CODE_FOR_avx512f_loadv16sf_mask:
10629 case CODE_FOR_avx512f_loadv16si_mask:
10630 case CODE_FOR_avx512f_loadv8df_mask:
10631 case CODE_FOR_avx512f_loadv8di_mask:
10632 case CODE_FOR_avx512vl_loadv8sf_mask:
10633 case CODE_FOR_avx512vl_loadv8si_mask:
10634 case CODE_FOR_avx512vl_loadv4df_mask:
10635 case CODE_FOR_avx512vl_loadv4di_mask:
10636 case CODE_FOR_avx512vl_loadv4sf_mask:
10637 case CODE_FOR_avx512vl_loadv4si_mask:
10638 case CODE_FOR_avx512vl_loadv2df_mask:
10639 case CODE_FOR_avx512vl_loadv2di_mask:
10640 case CODE_FOR_avx512bw_loadv64qi_mask:
10641 case CODE_FOR_avx512vl_loadv32qi_mask:
10642 case CODE_FOR_avx512vl_loadv16qi_mask:
10643 case CODE_FOR_avx512bw_loadv32hi_mask:
10644 case CODE_FOR_avx512vl_loadv16hi_mask:
10645 case CODE_FOR_avx512vl_loadv8hi_mask:
10646 aligned_mem = true;
10647 break;
10648 default:
10649 break;
10650 }
10651 /* FALLTHRU */
10652 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10653 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10654 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10655 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10656 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10657 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10658 case V16SI_FTYPE_PCINT_V16SI_UHI:
10659 case V8SI_FTYPE_PCINT_V8SI_UQI:
10660 case V4SI_FTYPE_PCINT_V4SI_UQI:
10661 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10662 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10663 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10664 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10665 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10666 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10667 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10668 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10669 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10670 nargs = 3;
10671 klass = load;
10672 memory = 0;
10673 break;
10674 default:
10675 gcc_unreachable ();
10676 }
10677
10678 gcc_assert (nargs <= ARRAY_SIZE (args));
10679
10680 if (klass == store)
10681 {
10682 arg = CALL_EXPR_ARG (exp, 0);
10683 op = expand_normal (arg);
10684 gcc_assert (target == 0);
10685 if (memory)
10686 {
10687 op = ix86_zero_extend_to_Pmode (op);
10688 target = gen_rtx_MEM (tmode, op);
10689 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10690 on it. Try to improve it using get_pointer_alignment,
10691 and if the special builtin is one that requires strict
10692 mode alignment, also from its GET_MODE_ALIGNMENT.
10693 Failure to do so could lead to ix86_legitimate_combined_insn
10694 rejecting all changes to such insns. */
10695 unsigned int align = get_pointer_alignment (arg);
10696 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10697 align = GET_MODE_ALIGNMENT (tmode);
10698 if (MEM_ALIGN (target) < align)
10699 set_mem_align (target, align);
10700 }
10701 else
10702 target = force_reg (tmode, op);
10703 arg_adjust = 1;
10704 }
10705 else
10706 {
10707 arg_adjust = 0;
10708 if (optimize
10709 || target == 0
10710 || !register_operand (target, tmode)
10711 || GET_MODE (target) != tmode)
10712 target = gen_reg_rtx (tmode);
10713 }
10714
10715 for (i = 0; i < nargs; i++)
10716 {
10717 machine_mode mode = insn_p->operand[i + 1].mode;
10718 bool match;
10719
10720 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10721 op = expand_normal (arg);
10722 match = insn_p->operand[i + 1].predicate (op, mode);
10723
10724 if (last_arg_constant && (i + 1) == nargs)
10725 {
10726 if (!match)
10727 {
10728 error ("the last argument must be an 8-bit immediate");
10729 return const0_rtx;
10730 }
10731 }
10732 else
10733 {
10734 if (i == memory)
10735 {
10736 /* This must be the memory operand. */
10737 op = ix86_zero_extend_to_Pmode (op);
10738 op = gen_rtx_MEM (mode, op);
10739 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10740 on it. Try to improve it using get_pointer_alignment,
10741 and if the special builtin is one that requires strict
10742 mode alignment, also from its GET_MODE_ALIGNMENT.
10743 Failure to do so could lead to ix86_legitimate_combined_insn
10744 rejecting all changes to such insns. */
10745 unsigned int align = get_pointer_alignment (arg);
10746 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10747 align = GET_MODE_ALIGNMENT (mode);
10748 if (MEM_ALIGN (op) < align)
10749 set_mem_align (op, align);
10750 }
10751 else
10752 {
10753 /* This must be a register. */
10754 if (VECTOR_MODE_P (mode))
10755 op = safe_vector_operand (op, mode);
10756
10757 op = fixup_modeless_constant (op, mode);
10758
10759 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10760 op = copy_to_mode_reg (mode, op);
10761 else
10762 {
10763 op = copy_to_reg (op);
10764 op = lowpart_subreg (mode, op, GET_MODE (op));
10765 }
10766 }
10767 }
10768
10769 args[i].op = op;
10770 args[i].mode = mode;
10771 }
10772
10773 switch (nargs)
10774 {
10775 case 0:
10776 pat = GEN_FCN (icode) (target);
10777 break;
10778 case 1:
10779 pat = GEN_FCN (icode) (target, args[0].op);
10780 break;
10781 case 2:
10782 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10783 break;
10784 case 3:
10785 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10786 break;
10787 default:
10788 gcc_unreachable ();
10789 }
10790
10791 if (! pat)
10792 return 0;
10793 emit_insn (pat);
10794 return klass == store ? 0 : target;
10795 }
10796
10797 /* Return the integer constant in ARG. Constrain it to be in the range
10798 of the subparts of VEC_TYPE; issue an error if not. */
10799
10800 static int
10801 get_element_number (tree vec_type, tree arg)
10802 {
10803 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10804
10805 if (!tree_fits_uhwi_p (arg)
10806 || (elt = tree_to_uhwi (arg), elt > max))
10807 {
10808 error ("selector must be an integer constant in the range "
10809 "[0, %wi]", max);
10810 return 0;
10811 }
10812
10813 return elt;
10814 }
10815
10816 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10817 ix86_expand_vector_init. We DO have language-level syntax for this, in
10818 the form of (type){ init-list }. Except that since we can't place emms
10819 instructions from inside the compiler, we can't allow the use of MMX
10820 registers unless the user explicitly asks for it. So we do *not* define
10821 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10822 we have builtins invoked by mmintrin.h that give us license to emit
10823 these sorts of instructions. */
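/* For instance, _mm_set_pi32 in mmintrin.h expands to
   __builtin_ia32_vec_init_v2si, which is handled here (one example; the
   intrinsic headers carry the full set of such wrappers).  */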
10824
10825 static rtx
10826 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10827 {
10828 machine_mode tmode = TYPE_MODE (type);
10829 machine_mode inner_mode = GET_MODE_INNER (tmode);
10830 int i, n_elt = GET_MODE_NUNITS (tmode);
10831 rtvec v = rtvec_alloc (n_elt);
10832
10833 gcc_assert (VECTOR_MODE_P (tmode));
10834 gcc_assert (call_expr_nargs (exp) == n_elt);
10835
10836 for (i = 0; i < n_elt; ++i)
10837 {
10838 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10839 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10840 }
10841
10842 if (!target || !register_operand (target, tmode))
10843 target = gen_reg_rtx (tmode);
10844
10845 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10846 return target;
10847 }
10848
10849 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10850 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10851 had a language-level syntax for referencing vector elements. */
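/* For instance, _mm_cvtss_f32 wraps __builtin_ia32_vec_ext_v4sf (x, 0)
   (one example; see the intrinsic headers for the full set).  */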
10852
10853 static rtx
10854 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10855 {
10856 machine_mode tmode, mode0;
10857 tree arg0, arg1;
10858 int elt;
10859 rtx op0;
10860
10861 arg0 = CALL_EXPR_ARG (exp, 0);
10862 arg1 = CALL_EXPR_ARG (exp, 1);
10863
10864 op0 = expand_normal (arg0);
10865 elt = get_element_number (TREE_TYPE (arg0), arg1);
10866
10867 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10868 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10869 gcc_assert (VECTOR_MODE_P (mode0));
10870
10871 op0 = force_reg (mode0, op0);
10872
10873 if (optimize || !target || !register_operand (target, tmode))
10874 target = gen_reg_rtx (tmode);
10875
10876 ix86_expand_vector_extract (true, target, op0, elt);
10877
10878 return target;
10879 }
10880
10881 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10882 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10883 a language-level syntax for referencing vector elements. */
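/* For instance, _mm_insert_epi16 wraps __builtin_ia32_vec_set_v8hi
   (one example; see the intrinsic headers for the full set).  */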
10884
10885 static rtx
10886 ix86_expand_vec_set_builtin (tree exp)
10887 {
10888 machine_mode tmode, mode1;
10889 tree arg0, arg1, arg2;
10890 int elt;
10891 rtx op0, op1, target;
10892
10893 arg0 = CALL_EXPR_ARG (exp, 0);
10894 arg1 = CALL_EXPR_ARG (exp, 1);
10895 arg2 = CALL_EXPR_ARG (exp, 2);
10896
10897 tmode = TYPE_MODE (TREE_TYPE (arg0));
10898 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10899 gcc_assert (VECTOR_MODE_P (tmode));
10900
10901 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10902 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10903 elt = get_element_number (TREE_TYPE (arg0), arg2);
10904
10905 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10906 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10907
10908 op0 = force_reg (tmode, op0);
10909 op1 = force_reg (mode1, op1);
10910
10911 /* OP0 is the source of these builtin functions and shouldn't be
10912 modified. Create a copy, use it and return it as target. */
10913 target = gen_reg_rtx (tmode);
10914 emit_move_insn (target, op0);
10915 ix86_expand_vector_set (true, target, op1, elt);
10916
10917 return target;
10918 }
10919
10920 /* Expand an expression EXP that calls a built-in function,
10921 with result going to TARGET if that's convenient
10922 (and in mode MODE if that's convenient).
10923 SUBTARGET may be used as the target for computing one of EXP's operands.
10924 IGNORE is nonzero if the value is to be ignored. */
10925
10926 rtx
10927 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10928 machine_mode mode, int ignore)
10929 {
10930 size_t i;
10931 enum insn_code icode, icode2;
10932 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10933 tree arg0, arg1, arg2, arg3, arg4;
10934 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10935 machine_mode mode0, mode1, mode2, mode3, mode4;
10936 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
10937
10938 /* For CPU builtins that can be folded, fold first and expand the fold. */
10939 switch (fcode)
10940 {
10941 case IX86_BUILTIN_CPU_INIT:
10942 {
10943 /* Make it call __cpu_indicator_init in libgcc. */
10944 tree call_expr, fndecl, type;
10945 type = build_function_type_list (integer_type_node, NULL_TREE);
10946 fndecl = build_fn_decl ("__cpu_indicator_init", type);
10947 call_expr = build_call_expr (fndecl, 0);
10948 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
10949 }
10950 case IX86_BUILTIN_CPU_IS:
10951 case IX86_BUILTIN_CPU_SUPPORTS:
10952 {
10953 tree arg0 = CALL_EXPR_ARG (exp, 0);
10954 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
10955 gcc_assert (fold_expr != NULL_TREE);
10956 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
10957 }
10958 }
10959
10960 HOST_WIDE_INT isa = ix86_isa_flags;
10961 HOST_WIDE_INT isa2 = ix86_isa_flags2;
10962 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
10963 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
10964 /* The general case is that we require all the ISAs specified in bisa{,2}
10965 to be enabled.
10966 The exceptions are:
10967 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
10968 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
10969 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
10970 where for each such pair it is sufficient if either of the ISAs is
10971 enabled (plus, if the pair is ORed with other options, those others too).
10972 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
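/* For example, a builtin whose recorded isa mask is
   OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 is accepted when either
   -mfma or -mfma4 is enabled.  */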
10973 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10974 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10975 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
10976 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
10977 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10978 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10979 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
10980 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
10981 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10982 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10983 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
10984 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
10985 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
10986 {
10987 bisa &= ~OPTION_MASK_ISA_MMX;
10988 bisa |= OPTION_MASK_ISA_SSE2;
10989 }
10990 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
10991 {
10992 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
10993 if (TARGET_ABI_X32)
10994 bisa |= OPTION_MASK_ABI_X32;
10995 else
10996 bisa |= OPTION_MASK_ABI_64;
10997 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
10998 (enum fpmath_unit) 0,
10999 (enum prefer_vector_width) 0,
11000 false, add_abi_p);
11001 if (!opts)
11002 error ("%qE needs unknown isa option", fndecl);
11003 else
11004 {
11005 gcc_assert (opts != NULL);
11006 error ("%qE needs isa option %s", fndecl, opts);
11007 free (opts);
11008 }
11009 return expand_call (exp, target, ignore);
11010 }
11011
11012 switch (fcode)
11013 {
11014 case IX86_BUILTIN_MASKMOVQ:
11015 case IX86_BUILTIN_MASKMOVDQU:
11016 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11017 ? CODE_FOR_mmx_maskmovq
11018 : CODE_FOR_sse2_maskmovdqu);
11019 /* Note the arg order is different from the operand order. */
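/* The builtin's last argument is the address, which becomes operand 0
   (the destination MEM); the first two arguments become operands 1
   and 2 (the data and the mask).  */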
11020 arg1 = CALL_EXPR_ARG (exp, 0);
11021 arg2 = CALL_EXPR_ARG (exp, 1);
11022 arg0 = CALL_EXPR_ARG (exp, 2);
11023 op0 = expand_normal (arg0);
11024 op1 = expand_normal (arg1);
11025 op2 = expand_normal (arg2);
11026 mode0 = insn_data[icode].operand[0].mode;
11027 mode1 = insn_data[icode].operand[1].mode;
11028 mode2 = insn_data[icode].operand[2].mode;
11029
11030 op0 = ix86_zero_extend_to_Pmode (op0);
11031 op0 = gen_rtx_MEM (mode1, op0);
11032
11033 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11034 op0 = copy_to_mode_reg (mode0, op0);
11035 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11036 op1 = copy_to_mode_reg (mode1, op1);
11037 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11038 op2 = copy_to_mode_reg (mode2, op2);
11039 pat = GEN_FCN (icode) (op0, op1, op2);
11040 if (! pat)
11041 return 0;
11042 emit_insn (pat);
11043 return 0;
11044
11045 case IX86_BUILTIN_LDMXCSR:
11046 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11047 target = assign_386_stack_local (SImode, SLOT_TEMP);
11048 emit_move_insn (target, op0);
11049 emit_insn (gen_sse_ldmxcsr (target));
11050 return 0;
11051
11052 case IX86_BUILTIN_STMXCSR:
11053 target = assign_386_stack_local (SImode, SLOT_TEMP);
11054 emit_insn (gen_sse_stmxcsr (target));
11055 return copy_to_mode_reg (SImode, target);
11056
11057 case IX86_BUILTIN_CLFLUSH:
11058 arg0 = CALL_EXPR_ARG (exp, 0);
11059 op0 = expand_normal (arg0);
11060 icode = CODE_FOR_sse2_clflush;
11061 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11062 op0 = ix86_zero_extend_to_Pmode (op0);
11063
11064 emit_insn (gen_sse2_clflush (op0));
11065 return 0;
11066
11067 case IX86_BUILTIN_CLWB:
11068 arg0 = CALL_EXPR_ARG (exp, 0);
11069 op0 = expand_normal (arg0);
11070 icode = CODE_FOR_clwb;
11071 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11072 op0 = ix86_zero_extend_to_Pmode (op0);
11073
11074 emit_insn (gen_clwb (op0));
11075 return 0;
11076
11077 case IX86_BUILTIN_CLFLUSHOPT:
11078 arg0 = CALL_EXPR_ARG (exp, 0);
11079 op0 = expand_normal (arg0);
11080 icode = CODE_FOR_clflushopt;
11081 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11082 op0 = ix86_zero_extend_to_Pmode (op0);
11083
11084 emit_insn (gen_clflushopt (op0));
11085 return 0;
11086
11087 case IX86_BUILTIN_MONITOR:
11088 case IX86_BUILTIN_MONITORX:
11089 arg0 = CALL_EXPR_ARG (exp, 0);
11090 arg1 = CALL_EXPR_ARG (exp, 1);
11091 arg2 = CALL_EXPR_ARG (exp, 2);
11092 op0 = expand_normal (arg0);
11093 op1 = expand_normal (arg1);
11094 op2 = expand_normal (arg2);
11095 if (!REG_P (op0))
11096 op0 = ix86_zero_extend_to_Pmode (op0);
11097 if (!REG_P (op1))
11098 op1 = copy_to_mode_reg (SImode, op1);
11099 if (!REG_P (op2))
11100 op2 = copy_to_mode_reg (SImode, op2);
11101
11102 emit_insn (fcode == IX86_BUILTIN_MONITOR
11103 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11104 : gen_monitorx (Pmode, op0, op1, op2));
11105 return 0;
11106
11107 case IX86_BUILTIN_MWAIT:
11108 arg0 = CALL_EXPR_ARG (exp, 0);
11109 arg1 = CALL_EXPR_ARG (exp, 1);
11110 op0 = expand_normal (arg0);
11111 op1 = expand_normal (arg1);
11112 if (!REG_P (op0))
11113 op0 = copy_to_mode_reg (SImode, op0);
11114 if (!REG_P (op1))
11115 op1 = copy_to_mode_reg (SImode, op1);
11116 emit_insn (gen_sse3_mwait (op0, op1));
11117 return 0;
11118
11119 case IX86_BUILTIN_MWAITX:
11120 arg0 = CALL_EXPR_ARG (exp, 0);
11121 arg1 = CALL_EXPR_ARG (exp, 1);
11122 arg2 = CALL_EXPR_ARG (exp, 2);
11123 op0 = expand_normal (arg0);
11124 op1 = expand_normal (arg1);
11125 op2 = expand_normal (arg2);
11126 if (!REG_P (op0))
11127 op0 = copy_to_mode_reg (SImode, op0);
11128 if (!REG_P (op1))
11129 op1 = copy_to_mode_reg (SImode, op1);
11130 if (!REG_P (op2))
11131 op2 = copy_to_mode_reg (SImode, op2);
11132 emit_insn (gen_mwaitx (op0, op1, op2));
11133 return 0;
11134
11135 case IX86_BUILTIN_UMONITOR:
11136 arg0 = CALL_EXPR_ARG (exp, 0);
11137 op0 = expand_normal (arg0);
11138
11139 op0 = ix86_zero_extend_to_Pmode (op0);
11140 emit_insn (gen_umonitor (Pmode, op0));
11141 return 0;
11142
11143 case IX86_BUILTIN_UMWAIT:
11144 case IX86_BUILTIN_TPAUSE:
11145 arg0 = CALL_EXPR_ARG (exp, 0);
11146 arg1 = CALL_EXPR_ARG (exp, 1);
11147 op0 = expand_normal (arg0);
11148 op1 = expand_normal (arg1);
11149
11150 if (!REG_P (op0))
11151 op0 = copy_to_mode_reg (SImode, op0);
11152
11153 op1 = force_reg (DImode, op1);
11154
11155 if (TARGET_64BIT)
11156 {
11157 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11158 NULL, 1, OPTAB_DIRECT);
11159 switch (fcode)
11160 {
11161 case IX86_BUILTIN_UMWAIT:
11162 icode = CODE_FOR_umwait_rex64;
11163 break;
11164 case IX86_BUILTIN_TPAUSE:
11165 icode = CODE_FOR_tpause_rex64;
11166 break;
11167 default:
11168 gcc_unreachable ();
11169 }
11170
11171 op2 = gen_lowpart (SImode, op2);
11172 op1 = gen_lowpart (SImode, op1);
11173 pat = GEN_FCN (icode) (op0, op1, op2);
11174 }
11175 else
11176 {
11177 switch (fcode)
11178 {
11179 case IX86_BUILTIN_UMWAIT:
11180 icode = CODE_FOR_umwait;
11181 break;
11182 case IX86_BUILTIN_TPAUSE:
11183 icode = CODE_FOR_tpause;
11184 break;
11185 default:
11186 gcc_unreachable ();
11187 }
11188 pat = GEN_FCN (icode) (op0, op1);
11189 }
11190
11191 if (!pat)
11192 return 0;
11193
11194 emit_insn (pat);
11195
11196 if (target == 0
11197 || !register_operand (target, QImode))
11198 target = gen_reg_rtx (QImode);
11199
11200 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11201 const0_rtx);
11202 emit_insn (gen_rtx_SET (target, pat));
11203
11204 return target;
11205
11206 case IX86_BUILTIN_CLZERO:
11207 arg0 = CALL_EXPR_ARG (exp, 0);
11208 op0 = expand_normal (arg0);
11209 if (!REG_P (op0))
11210 op0 = ix86_zero_extend_to_Pmode (op0);
11211 emit_insn (gen_clzero (Pmode, op0));
11212 return 0;
11213
11214 case IX86_BUILTIN_CLDEMOTE:
11215 arg0 = CALL_EXPR_ARG (exp, 0);
11216 op0 = expand_normal (arg0);
11217 icode = CODE_FOR_cldemote;
11218 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11219 op0 = ix86_zero_extend_to_Pmode (op0);
11220
11221 emit_insn (gen_cldemote (op0));
11222 return 0;
11223
11224 case IX86_BUILTIN_VEC_INIT_V2SI:
11225 case IX86_BUILTIN_VEC_INIT_V4HI:
11226 case IX86_BUILTIN_VEC_INIT_V8QI:
11227 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11228
11229 case IX86_BUILTIN_VEC_EXT_V2DF:
11230 case IX86_BUILTIN_VEC_EXT_V2DI:
11231 case IX86_BUILTIN_VEC_EXT_V4SF:
11232 case IX86_BUILTIN_VEC_EXT_V4SI:
11233 case IX86_BUILTIN_VEC_EXT_V8HI:
11234 case IX86_BUILTIN_VEC_EXT_V2SI:
11235 case IX86_BUILTIN_VEC_EXT_V4HI:
11236 case IX86_BUILTIN_VEC_EXT_V16QI:
11237 return ix86_expand_vec_ext_builtin (exp, target);
11238
11239 case IX86_BUILTIN_VEC_SET_V2DI:
11240 case IX86_BUILTIN_VEC_SET_V4SF:
11241 case IX86_BUILTIN_VEC_SET_V4SI:
11242 case IX86_BUILTIN_VEC_SET_V8HI:
11243 case IX86_BUILTIN_VEC_SET_V4HI:
11244 case IX86_BUILTIN_VEC_SET_V16QI:
11245 return ix86_expand_vec_set_builtin (exp);
11246
11247 case IX86_BUILTIN_NANQ:
11248 case IX86_BUILTIN_NANSQ:
11249 return expand_call (exp, target, ignore);
11250
11251 case IX86_BUILTIN_RDPID:
11252
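/* rdpid reads the processor ID into a word_mode register; on 64-bit
   targets the DImode result is then narrowed to SImode below, since
   the builtin returns a 32-bit value.  */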
11253 op0 = gen_reg_rtx (word_mode);
11254
11255 if (TARGET_64BIT)
11256 {
11257 insn = gen_rdpid_rex64 (op0);
11258 op0 = convert_to_mode (SImode, op0, 1);
11259 }
11260 else
11261 insn = gen_rdpid (op0);
11262
11263 emit_insn (insn);
11264
11265 if (target == 0
11266 || !register_operand (target, SImode))
11267 target = gen_reg_rtx (SImode);
11268
11269 emit_move_insn (target, op0);
11270 return target;
11271
11272 case IX86_BUILTIN_2INTERSECTD512:
11273 case IX86_BUILTIN_2INTERSECTQ512:
11274 case IX86_BUILTIN_2INTERSECTD256:
11275 case IX86_BUILTIN_2INTERSECTQ256:
11276 case IX86_BUILTIN_2INTERSECTD128:
11277 case IX86_BUILTIN_2INTERSECTQ128:
11278 arg0 = CALL_EXPR_ARG (exp, 0);
11279 arg1 = CALL_EXPR_ARG (exp, 1);
11280 arg2 = CALL_EXPR_ARG (exp, 2);
11281 arg3 = CALL_EXPR_ARG (exp, 3);
11282 op0 = expand_normal (arg0);
11283 op1 = expand_normal (arg1);
11284 op2 = expand_normal (arg2);
11285 op3 = expand_normal (arg3);
11286
11287 if (!address_operand (op0, VOIDmode))
11288 {
11289 op0 = convert_memory_address (Pmode, op0);
11290 op0 = copy_addr_to_reg (op0);
11291 }
11292 if (!address_operand (op1, VOIDmode))
11293 {
11294 op1 = convert_memory_address (Pmode, op1);
11295 op1 = copy_addr_to_reg (op1);
11296 }
11297
11298 switch (fcode)
11299 {
11300 case IX86_BUILTIN_2INTERSECTD512:
11301 mode4 = P2HImode;
11302 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11303 break;
11304 case IX86_BUILTIN_2INTERSECTQ512:
11305 mode4 = P2QImode;
11306 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11307 break;
11308 case IX86_BUILTIN_2INTERSECTD256:
11309 mode4 = P2QImode;
11310 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11311 break;
11312 case IX86_BUILTIN_2INTERSECTQ256:
11313 mode4 = P2QImode;
11314 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11315 break;
11316 case IX86_BUILTIN_2INTERSECTD128:
11317 mode4 = P2QImode;
11318 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11319 break;
11320 case IX86_BUILTIN_2INTERSECTQ128:
11321 mode4 = P2QImode;
11322 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11323 break;
11324 default:
11325 gcc_unreachable ();
11326 }
11327
11328 mode2 = insn_data[icode].operand[1].mode;
11329 mode3 = insn_data[icode].operand[2].mode;
11330 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11331 op2 = copy_to_mode_reg (mode2, op2);
11332 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11333 op3 = copy_to_mode_reg (mode3, op3);
11334
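/* The vp2intersect patterns compute two mask registers at once and
   model the pair as a single P2HImode/P2QImode value; split it below
   with gen_lowpart/gen_highpart and store each mask through the
   corresponding pointer argument.  */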
11335 op4 = gen_reg_rtx (mode4);
11336 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11337 mode0 = mode4 == P2HImode ? HImode : QImode;
11338 emit_move_insn (gen_rtx_MEM (mode0, op0),
11339 gen_lowpart (mode0, op4));
11340 emit_move_insn (gen_rtx_MEM (mode0, op1),
11341 gen_highpart (mode0, op4));
11342
11343 return 0;
11344
11345 case IX86_BUILTIN_RDPMC:
11346 case IX86_BUILTIN_RDTSC:
11347 case IX86_BUILTIN_RDTSCP:
11348 case IX86_BUILTIN_XGETBV:
11349
11350 op0 = gen_reg_rtx (DImode);
11351 op1 = gen_reg_rtx (DImode);
11352
11353 if (fcode == IX86_BUILTIN_RDPMC)
11354 {
11355 arg0 = CALL_EXPR_ARG (exp, 0);
11356 op2 = expand_normal (arg0);
11357 if (!register_operand (op2, SImode))
11358 op2 = copy_to_mode_reg (SImode, op2);
11359
11360 insn = (TARGET_64BIT
11361 ? gen_rdpmc_rex64 (op0, op1, op2)
11362 : gen_rdpmc (op0, op2));
11363 emit_insn (insn);
11364 }
11365 else if (fcode == IX86_BUILTIN_XGETBV)
11366 {
11367 arg0 = CALL_EXPR_ARG (exp, 0);
11368 op2 = expand_normal (arg0);
11369 if (!register_operand (op2, SImode))
11370 op2 = copy_to_mode_reg (SImode, op2);
11371
11372 insn = (TARGET_64BIT
11373 ? gen_xgetbv_rex64 (op0, op1, op2)
11374 : gen_xgetbv (op0, op2));
11375 emit_insn (insn);
11376 }
11377 else if (fcode == IX86_BUILTIN_RDTSC)
11378 {
11379 insn = (TARGET_64BIT
11380 ? gen_rdtsc_rex64 (op0, op1)
11381 : gen_rdtsc (op0));
11382 emit_insn (insn);
11383 }
11384 else
11385 {
11386 op2 = gen_reg_rtx (SImode);
11387
11388 insn = (TARGET_64BIT
11389 ? gen_rdtscp_rex64 (op0, op1, op2)
11390 : gen_rdtscp (op0, op2));
11391 emit_insn (insn);
11392
11393 arg0 = CALL_EXPR_ARG (exp, 0);
11394 op4 = expand_normal (arg0);
11395 if (!address_operand (op4, VOIDmode))
11396 {
11397 op4 = convert_memory_address (Pmode, op4);
11398 op4 = copy_addr_to_reg (op4);
11399 }
11400 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11401 }
11402
11403 if (target == 0
11404 || !register_operand (target, DImode))
11405 target = gen_reg_rtx (DImode);
11406
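/* On 64-bit targets the result comes back as two 32-bit halves held in
   DImode registers (the low part in op0, the high part in op1);
   recombine them below, i.e. op0 |= op1 << 32.  */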
11407 if (TARGET_64BIT)
11408 {
11409 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11410 op1, 1, OPTAB_DIRECT);
11411 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11412 op0, 1, OPTAB_DIRECT);
11413 }
11414
11415 emit_move_insn (target, op0);
11416 return target;
11417
11418 case IX86_BUILTIN_ENQCMD:
11419 case IX86_BUILTIN_ENQCMDS:
11420 case IX86_BUILTIN_MOVDIR64B:
11421
11422 arg0 = CALL_EXPR_ARG (exp, 0);
11423 arg1 = CALL_EXPR_ARG (exp, 1);
11424 op0 = expand_normal (arg0);
11425 op1 = expand_normal (arg1);
11426
11427 op0 = ix86_zero_extend_to_Pmode (op0);
11428 if (!address_operand (op1, VOIDmode))
11429 {
11430 op1 = convert_memory_address (Pmode, op1);
11431 op1 = copy_addr_to_reg (op1);
11432 }
11433 op1 = gen_rtx_MEM (XImode, op1);
11434
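/* op1 is the 64-byte (XImode) source of the store.  movdir64b only
   emits the store; enqcmd/enqcmds additionally return the resulting
   zero flag, which is read back below through a setcc into the low
   byte of the SImode target.  */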
11435 if (fcode == IX86_BUILTIN_MOVDIR64B)
11436 {
11437 emit_insn (gen_movdir64b (Pmode, op0, op1));
11438 return 0;
11439 }
11440 else
11441 {
11442 if (target == 0
11443 || !register_operand (target, SImode))
11444 target = gen_reg_rtx (SImode);
11445
11446 emit_move_insn (target, const0_rtx);
11447 target = gen_rtx_SUBREG (QImode, target, 0);
11448
11449 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
11450 ? UNSPECV_ENQCMD
11451 : UNSPECV_ENQCMDS);
11452 icode = code_for_enqcmd (unspecv, Pmode);
11453 emit_insn (GEN_FCN (icode) (op0, op1));
11454
11455 emit_insn
11456 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11457 gen_rtx_fmt_ee (EQ, QImode,
11458 gen_rtx_REG (CCZmode, FLAGS_REG),
11459 const0_rtx)));
11460 return SUBREG_REG (target);
11461 }
11462
11463 case IX86_BUILTIN_FXSAVE:
11464 case IX86_BUILTIN_FXRSTOR:
11465 case IX86_BUILTIN_FXSAVE64:
11466 case IX86_BUILTIN_FXRSTOR64:
11467 case IX86_BUILTIN_FNSTENV:
11468 case IX86_BUILTIN_FLDENV:
11469 mode0 = BLKmode;
11470 switch (fcode)
11471 {
11472 case IX86_BUILTIN_FXSAVE:
11473 icode = CODE_FOR_fxsave;
11474 break;
11475 case IX86_BUILTIN_FXRSTOR:
11476 icode = CODE_FOR_fxrstor;
11477 break;
11478 case IX86_BUILTIN_FXSAVE64:
11479 icode = CODE_FOR_fxsave64;
11480 break;
11481 case IX86_BUILTIN_FXRSTOR64:
11482 icode = CODE_FOR_fxrstor64;
11483 break;
11484 case IX86_BUILTIN_FNSTENV:
11485 icode = CODE_FOR_fnstenv;
11486 break;
11487 case IX86_BUILTIN_FLDENV:
11488 icode = CODE_FOR_fldenv;
11489 break;
11490 default:
11491 gcc_unreachable ();
11492 }
11493
11494 arg0 = CALL_EXPR_ARG (exp, 0);
11495 op0 = expand_normal (arg0);
11496
11497 if (!address_operand (op0, VOIDmode))
11498 {
11499 op0 = convert_memory_address (Pmode, op0);
11500 op0 = copy_addr_to_reg (op0);
11501 }
11502 op0 = gen_rtx_MEM (mode0, op0);
11503
11504 pat = GEN_FCN (icode) (op0);
11505 if (pat)
11506 emit_insn (pat);
11507 return 0;
11508
11509 case IX86_BUILTIN_XSETBV:
11510 arg0 = CALL_EXPR_ARG (exp, 0);
11511 arg1 = CALL_EXPR_ARG (exp, 1);
11512 op0 = expand_normal (arg0);
11513 op1 = expand_normal (arg1);
11514
11515 if (!REG_P (op0))
11516 op0 = copy_to_mode_reg (SImode, op0);
11517
11518 op1 = force_reg (DImode, op1);
11519
11520 if (TARGET_64BIT)
11521 {
11522 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11523 NULL, 1, OPTAB_DIRECT);
11524
11525 icode = CODE_FOR_xsetbv_rex64;
11526
11527 op2 = gen_lowpart (SImode, op2);
11528 op1 = gen_lowpart (SImode, op1);
11529 pat = GEN_FCN (icode) (op0, op1, op2);
11530 }
11531 else
11532 {
11533 icode = CODE_FOR_xsetbv;
11534
11535 pat = GEN_FCN (icode) (op0, op1);
11536 }
11537 if (pat)
11538 emit_insn (pat);
11539 return 0;
11540
11541 case IX86_BUILTIN_XSAVE:
11542 case IX86_BUILTIN_XRSTOR:
11543 case IX86_BUILTIN_XSAVE64:
11544 case IX86_BUILTIN_XRSTOR64:
11545 case IX86_BUILTIN_XSAVEOPT:
11546 case IX86_BUILTIN_XSAVEOPT64:
11547 case IX86_BUILTIN_XSAVES:
11548 case IX86_BUILTIN_XRSTORS:
11549 case IX86_BUILTIN_XSAVES64:
11550 case IX86_BUILTIN_XRSTORS64:
11551 case IX86_BUILTIN_XSAVEC:
11552 case IX86_BUILTIN_XSAVEC64:
11553 arg0 = CALL_EXPR_ARG (exp, 0);
11554 arg1 = CALL_EXPR_ARG (exp, 1);
11555 op0 = expand_normal (arg0);
11556 op1 = expand_normal (arg1);
11557
11558 if (!address_operand (op0, VOIDmode))
11559 {
11560 op0 = convert_memory_address (Pmode, op0);
11561 op0 = copy_addr_to_reg (op0);
11562 }
11563 op0 = gen_rtx_MEM (BLKmode, op0);
11564
11565 op1 = force_reg (DImode, op1);
11566
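/* The xsave family takes the requested-feature bitmap in EDX:EAX, so
   on 64-bit targets split the DImode mask into two SImode halves:
   roughly lo = (unsigned int) mask, hi = (unsigned int) (mask >> 32).  */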
11567 if (TARGET_64BIT)
11568 {
11569 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11570 NULL, 1, OPTAB_DIRECT);
11571 switch (fcode)
11572 {
11573 case IX86_BUILTIN_XSAVE:
11574 icode = CODE_FOR_xsave_rex64;
11575 break;
11576 case IX86_BUILTIN_XRSTOR:
11577 icode = CODE_FOR_xrstor_rex64;
11578 break;
11579 case IX86_BUILTIN_XSAVE64:
11580 icode = CODE_FOR_xsave64;
11581 break;
11582 case IX86_BUILTIN_XRSTOR64:
11583 icode = CODE_FOR_xrstor64;
11584 break;
11585 case IX86_BUILTIN_XSAVEOPT:
11586 icode = CODE_FOR_xsaveopt_rex64;
11587 break;
11588 case IX86_BUILTIN_XSAVEOPT64:
11589 icode = CODE_FOR_xsaveopt64;
11590 break;
11591 case IX86_BUILTIN_XSAVES:
11592 icode = CODE_FOR_xsaves_rex64;
11593 break;
11594 case IX86_BUILTIN_XRSTORS:
11595 icode = CODE_FOR_xrstors_rex64;
11596 break;
11597 case IX86_BUILTIN_XSAVES64:
11598 icode = CODE_FOR_xsaves64;
11599 break;
11600 case IX86_BUILTIN_XRSTORS64:
11601 icode = CODE_FOR_xrstors64;
11602 break;
11603 case IX86_BUILTIN_XSAVEC:
11604 icode = CODE_FOR_xsavec_rex64;
11605 break;
11606 case IX86_BUILTIN_XSAVEC64:
11607 icode = CODE_FOR_xsavec64;
11608 break;
11609 default:
11610 gcc_unreachable ();
11611 }
11612
11613 op2 = gen_lowpart (SImode, op2);
11614 op1 = gen_lowpart (SImode, op1);
11615 pat = GEN_FCN (icode) (op0, op1, op2);
11616 }
11617 else
11618 {
11619 switch (fcode)
11620 {
11621 case IX86_BUILTIN_XSAVE:
11622 icode = CODE_FOR_xsave;
11623 break;
11624 case IX86_BUILTIN_XRSTOR:
11625 icode = CODE_FOR_xrstor;
11626 break;
11627 case IX86_BUILTIN_XSAVEOPT:
11628 icode = CODE_FOR_xsaveopt;
11629 break;
11630 case IX86_BUILTIN_XSAVES:
11631 icode = CODE_FOR_xsaves;
11632 break;
11633 case IX86_BUILTIN_XRSTORS:
11634 icode = CODE_FOR_xrstors;
11635 break;
11636 case IX86_BUILTIN_XSAVEC:
11637 icode = CODE_FOR_xsavec;
11638 break;
11639 default:
11640 gcc_unreachable ();
11641 }
11642 pat = GEN_FCN (icode) (op0, op1);
11643 }
11644
11645 if (pat)
11646 emit_insn (pat);
11647 return 0;
11648
11649 case IX86_BUILTIN_LLWPCB:
11650 arg0 = CALL_EXPR_ARG (exp, 0);
11651 op0 = expand_normal (arg0);
11652
11653 if (!register_operand (op0, Pmode))
11654 op0 = ix86_zero_extend_to_Pmode (op0);
11655 emit_insn (gen_lwp_llwpcb (Pmode, op0));
11656 return 0;
11657
11658 case IX86_BUILTIN_SLWPCB:
11659 if (!target
11660 || !register_operand (target, Pmode))
11661 target = gen_reg_rtx (Pmode);
11662 emit_insn (gen_lwp_slwpcb (Pmode, target));
11663 return target;
11664
11665 case IX86_BUILTIN_LWPVAL32:
11666 case IX86_BUILTIN_LWPVAL64:
11667 case IX86_BUILTIN_LWPINS32:
11668 case IX86_BUILTIN_LWPINS64:
11669 mode = ((fcode == IX86_BUILTIN_LWPVAL32
11670 || fcode == IX86_BUILTIN_LWPINS32)
11671 ? SImode : DImode);
11672
11673 if (fcode == IX86_BUILTIN_LWPVAL32
11674 || fcode == IX86_BUILTIN_LWPVAL64)
11675 icode = code_for_lwp_lwpval (mode);
11676 else
11677 icode = code_for_lwp_lwpins (mode);
11678
11679 arg0 = CALL_EXPR_ARG (exp, 0);
11680 arg1 = CALL_EXPR_ARG (exp, 1);
11681 arg2 = CALL_EXPR_ARG (exp, 2);
11682 op0 = expand_normal (arg0);
11683 op1 = expand_normal (arg1);
11684 op2 = expand_normal (arg2);
11685 mode0 = insn_data[icode].operand[0].mode;
11686
11687 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11688 op0 = copy_to_mode_reg (mode0, op0);
11689 if (!insn_data[icode].operand[1].predicate (op1, SImode))
11690 op1 = copy_to_mode_reg (SImode, op1);
11691
11692 if (!CONST_INT_P (op2))
11693 {
11694 error ("the last argument must be a 32-bit immediate");
11695 return const0_rtx;
11696 }
11697
11698 emit_insn (GEN_FCN (icode) (op0, op1, op2));
11699
11700 if (fcode == IX86_BUILTIN_LWPINS32
11701 || fcode == IX86_BUILTIN_LWPINS64)
11702 {
11703 if (target == 0
11704 || !nonimmediate_operand (target, QImode))
11705 target = gen_reg_rtx (QImode);
11706
11707 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11708 const0_rtx);
11709 emit_insn (gen_rtx_SET (target, pat));
11710
11711 return target;
11712 }
11713 else
11714 return 0;
11715
11716 case IX86_BUILTIN_BEXTRI32:
11717 case IX86_BUILTIN_BEXTRI64:
11718 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
11719
11720 arg0 = CALL_EXPR_ARG (exp, 0);
11721 arg1 = CALL_EXPR_ARG (exp, 1);
11722 op0 = expand_normal (arg0);
11723 op1 = expand_normal (arg1);
11724
11725 if (!CONST_INT_P (op1))
11726 {
11727 error ("the last argument must be an immediate");
11728 return const0_rtx;
11729 }
11730 else
11731 {
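/* The immediate packs the starting bit position in its low byte and
   the field length in bits 8..15 (as in the TBM bextri immediate
   encoding); degenerate ranges fold to a zero result and over-long
   fields are clamped to the operand width below.  */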
11732 unsigned char lsb_index = UINTVAL (op1);
11733 unsigned char length = UINTVAL (op1) >> 8;
11734
11735 unsigned char bitsize = GET_MODE_BITSIZE (mode);
11736
11737 icode = code_for_tbm_bextri (mode);
11738
11739 mode1 = insn_data[icode].operand[1].mode;
11740 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11741 op0 = copy_to_mode_reg (mode1, op0);
11742
11743 mode0 = insn_data[icode].operand[0].mode;
11744 if (target == 0
11745 || !register_operand (target, mode0))
11746 target = gen_reg_rtx (mode0);
11747
11748 if (length == 0 || lsb_index >= bitsize)
11749 {
11750 emit_move_insn (target, const0_rtx);
11751 return target;
11752 }
11753
11754 if (length + lsb_index > bitsize)
11755 length = bitsize - lsb_index;
11756
11757 op1 = GEN_INT (length);
11758 op2 = GEN_INT (lsb_index);
11759
11760 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
11761 return target;
11762 }
11763
11764 case IX86_BUILTIN_RDRAND16_STEP:
11765 mode = HImode;
11766 goto rdrand_step;
11767
11768 case IX86_BUILTIN_RDRAND32_STEP:
11769 mode = SImode;
11770 goto rdrand_step;
11771
11772 case IX86_BUILTIN_RDRAND64_STEP:
11773 mode = DImode;
11774
11775 rdrand_step:
11776 arg0 = CALL_EXPR_ARG (exp, 0);
11777 op1 = expand_normal (arg0);
11778 if (!address_operand (op1, VOIDmode))
11779 {
11780 op1 = convert_memory_address (Pmode, op1);
11781 op1 = copy_addr_to_reg (op1);
11782 }
11783
11784 op0 = gen_reg_rtx (mode);
11785 emit_insn (gen_rdrand (mode, op0));
11786
11787 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
11788
11789 op1 = force_reg (SImode, const1_rtx);
11790
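/* rdrand clears its destination when it fails (CF unset), so the
   conditional move below yields 1 on success and the already-zeroed
   value on failure, giving the usual 1/0 step result without an
   explicit setcc.  */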
11791 /* Emit SImode conditional move. */
11792 if (mode == HImode)
11793 {
11794 if (TARGET_ZERO_EXTEND_WITH_AND
11795 && optimize_function_for_speed_p (cfun))
11796 {
11797 op2 = force_reg (SImode, const0_rtx);
11798
11799 emit_insn (gen_movstricthi
11800 (gen_lowpart (HImode, op2), op0));
11801 }
11802 else
11803 {
11804 op2 = gen_reg_rtx (SImode);
11805
11806 emit_insn (gen_zero_extendhisi2 (op2, op0));
11807 }
11808 }
11809 else if (mode == SImode)
11810 op2 = op0;
11811 else
11812 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11813
11814 if (target == 0
11815 || !register_operand (target, SImode))
11816 target = gen_reg_rtx (SImode);
11817
11818 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11819 const0_rtx);
11820 emit_insn (gen_rtx_SET (target,
11821 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11822 return target;
11823
11824 case IX86_BUILTIN_RDSEED16_STEP:
11825 mode = HImode;
11826 goto rdseed_step;
11827
11828 case IX86_BUILTIN_RDSEED32_STEP:
11829 mode = SImode;
11830 goto rdseed_step;
11831
11832 case IX86_BUILTIN_RDSEED64_STEP:
11833 mode = DImode;
11834
11835 rdseed_step:
11836 arg0 = CALL_EXPR_ARG (exp, 0);
11837 op1 = expand_normal (arg0);
11838 if (!address_operand (op1, VOIDmode))
11839 {
11840 op1 = convert_memory_address (Pmode, op1);
11841 op1 = copy_addr_to_reg (op1);
11842 }
11843
11844 op0 = gen_reg_rtx (mode);
11845 emit_insn (gen_rdseed (mode, op0));
11846
11847 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
11848
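/* Unlike rdrand above, return the carry flag directly: materialize CF
   into a QImode register via the LTU test on the CCC flags and
   zero-extend it to the SImode result.  */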
11849 op2 = gen_reg_rtx (QImode);
11850
11851 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11852 const0_rtx);
11853 emit_insn (gen_rtx_SET (op2, pat));
11854
11855 if (target == 0
11856 || !register_operand (target, SImode))
11857 target = gen_reg_rtx (SImode);
11858
11859 emit_insn (gen_zero_extendqisi2 (target, op2));
11860 return target;
11861
11862 case IX86_BUILTIN_SBB32:
11863 icode = CODE_FOR_subborrowsi;
11864 icode2 = CODE_FOR_subborrowsi_0;
11865 mode0 = SImode;
11866 mode1 = DImode;
11867 mode2 = CCmode;
11868 goto handlecarry;
11869
11870 case IX86_BUILTIN_SBB64:
11871 icode = CODE_FOR_subborrowdi;
11872 icode2 = CODE_FOR_subborrowdi_0;
11873 mode0 = DImode;
11874 mode1 = TImode;
11875 mode2 = CCmode;
11876 goto handlecarry;
11877
11878 case IX86_BUILTIN_ADDCARRYX32:
11879 icode = CODE_FOR_addcarrysi;
11880 icode2 = CODE_FOR_addcarrysi_0;
11881 mode0 = SImode;
11882 mode1 = DImode;
11883 mode2 = CCCmode;
11884 goto handlecarry;
11885
11886 case IX86_BUILTIN_ADDCARRYX64:
11887 icode = CODE_FOR_addcarrydi;
11888 icode2 = CODE_FOR_addcarrydi_0;
11889 mode0 = DImode;
11890 mode1 = TImode;
11891 mode2 = CCCmode;
11892
11893 handlecarry:
11894 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11895 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11896 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11897 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
11898
11899 op1 = expand_normal (arg0);
11900 if (!integer_zerop (arg0))
11901 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11902
11903 op2 = expand_normal (arg1);
11904 if (!register_operand (op2, mode0))
11905 op2 = copy_to_mode_reg (mode0, op2);
11906
11907 op3 = expand_normal (arg2);
11908 if (!register_operand (op3, mode0))
11909 op3 = copy_to_mode_reg (mode0, op3);
11910
11911 op4 = expand_normal (arg3);
11912 if (!address_operand (op4, VOIDmode))
11913 {
11914 op4 = convert_memory_address (Pmode, op4);
11915 op4 = copy_addr_to_reg (op4);
11916 }
11917
11918 op0 = gen_reg_rtx (mode0);
11919 if (integer_zerop (arg0))
11920 {
11921 /* If arg0 is 0, optimize right away into an add or sub
11922 instruction that sets the CCCmode flags.  */
11923 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11924 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11925 }
11926 else
11927 {
11928 /* Generate CF from input operand. */
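/* Adding -1 (0xff) to the QImode carry-in wraps whenever it is
   nonzero, so this leaves CF set exactly when c_in != 0.  */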
11929 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11930
11931 /* Generate instruction that consumes CF. */
11932 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11933 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11934 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11935 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11936 }
11937
11938 /* Return current CF value. */
11939 if (target == 0)
11940 target = gen_reg_rtx (QImode);
11941
11942 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11943 emit_insn (gen_rtx_SET (target, pat));
11944
11945 /* Store the result. */
11946 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11947
11948 return target;
11949
11950 case IX86_BUILTIN_READ_FLAGS:
11951 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11952
11953 if (optimize
11954 || target == NULL_RTX
11955 || !nonimmediate_operand (target, word_mode)
11956 || GET_MODE (target) != word_mode)
11957 target = gen_reg_rtx (word_mode);
11958
11959 emit_insn (gen_pop (target));
11960 return target;
11961
11962 case IX86_BUILTIN_WRITE_FLAGS:
11963
11964 arg0 = CALL_EXPR_ARG (exp, 0);
11965 op0 = expand_normal (arg0);
11966 if (!general_no_elim_operand (op0, word_mode))
11967 op0 = copy_to_mode_reg (word_mode, op0);
11968
11969 emit_insn (gen_push (op0));
11970 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11971 return 0;
11972
11973 case IX86_BUILTIN_KTESTC8:
11974 icode = CODE_FOR_ktestqi;
11975 mode3 = CCCmode;
11976 goto kortest;
11977
11978 case IX86_BUILTIN_KTESTZ8:
11979 icode = CODE_FOR_ktestqi;
11980 mode3 = CCZmode;
11981 goto kortest;
11982
11983 case IX86_BUILTIN_KTESTC16:
11984 icode = CODE_FOR_ktesthi;
11985 mode3 = CCCmode;
11986 goto kortest;
11987
11988 case IX86_BUILTIN_KTESTZ16:
11989 icode = CODE_FOR_ktesthi;
11990 mode3 = CCZmode;
11991 goto kortest;
11992
11993 case IX86_BUILTIN_KTESTC32:
11994 icode = CODE_FOR_ktestsi;
11995 mode3 = CCCmode;
11996 goto kortest;
11997
11998 case IX86_BUILTIN_KTESTZ32:
11999 icode = CODE_FOR_ktestsi;
12000 mode3 = CCZmode;
12001 goto kortest;
12002
12003 case IX86_BUILTIN_KTESTC64:
12004 icode = CODE_FOR_ktestdi;
12005 mode3 = CCCmode;
12006 goto kortest;
12007
12008 case IX86_BUILTIN_KTESTZ64:
12009 icode = CODE_FOR_ktestdi;
12010 mode3 = CCZmode;
12011 goto kortest;
12012
12013 case IX86_BUILTIN_KORTESTC8:
12014 icode = CODE_FOR_kortestqi;
12015 mode3 = CCCmode;
12016 goto kortest;
12017
12018 case IX86_BUILTIN_KORTESTZ8:
12019 icode = CODE_FOR_kortestqi;
12020 mode3 = CCZmode;
12021 goto kortest;
12022
12023 case IX86_BUILTIN_KORTESTC16:
12024 icode = CODE_FOR_kortesthi;
12025 mode3 = CCCmode;
12026 goto kortest;
12027
12028 case IX86_BUILTIN_KORTESTZ16:
12029 icode = CODE_FOR_kortesthi;
12030 mode3 = CCZmode;
12031 goto kortest;
12032
12033 case IX86_BUILTIN_KORTESTC32:
12034 icode = CODE_FOR_kortestsi;
12035 mode3 = CCCmode;
12036 goto kortest;
12037
12038 case IX86_BUILTIN_KORTESTZ32:
12039 icode = CODE_FOR_kortestsi;
12040 mode3 = CCZmode;
12041 goto kortest;
12042
12043 case IX86_BUILTIN_KORTESTC64:
12044 icode = CODE_FOR_kortestdi;
12045 mode3 = CCCmode;
12046 goto kortest;
12047
12048 case IX86_BUILTIN_KORTESTZ64:
12049 icode = CODE_FOR_kortestdi;
12050 mode3 = CCZmode;
12051
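/* Common expansion for the ktest/kortest builtins: the ...C variants
   test the carry flag (CCCmode) and the ...Z variants the zero flag
   (CCZmode); emit the mask-test insn and read the selected flag back
   with a setcc.  */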
12052 kortest:
12053 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12054 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12055 op0 = expand_normal (arg0);
12056 op1 = expand_normal (arg1);
12057
12058 mode0 = insn_data[icode].operand[0].mode;
12059 mode1 = insn_data[icode].operand[1].mode;
12060
12061 if (GET_MODE (op0) != VOIDmode)
12062 op0 = force_reg (GET_MODE (op0), op0);
12063
12064 op0 = gen_lowpart (mode0, op0);
12065
12066 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12067 op0 = copy_to_mode_reg (mode0, op0);
12068
12069 if (GET_MODE (op1) != VOIDmode)
12070 op1 = force_reg (GET_MODE (op1), op1);
12071
12072 op1 = gen_lowpart (mode1, op1);
12073
12074 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12075 op1 = copy_to_mode_reg (mode1, op1);
12076
12077 target = gen_reg_rtx (QImode);
12078
12079 /* Emit kortest. */
12080 emit_insn (GEN_FCN (icode) (op0, op1));
12081 /* And use setcc to return result from flags. */
12082 ix86_expand_setcc (target, EQ,
12083 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12084 return target;
12085
12086 case IX86_BUILTIN_GATHERSIV2DF:
12087 icode = CODE_FOR_avx2_gathersiv2df;
12088 goto gather_gen;
12089 case IX86_BUILTIN_GATHERSIV4DF:
12090 icode = CODE_FOR_avx2_gathersiv4df;
12091 goto gather_gen;
12092 case IX86_BUILTIN_GATHERDIV2DF:
12093 icode = CODE_FOR_avx2_gatherdiv2df;
12094 goto gather_gen;
12095 case IX86_BUILTIN_GATHERDIV4DF:
12096 icode = CODE_FOR_avx2_gatherdiv4df;
12097 goto gather_gen;
12098 case IX86_BUILTIN_GATHERSIV4SF:
12099 icode = CODE_FOR_avx2_gathersiv4sf;
12100 goto gather_gen;
12101 case IX86_BUILTIN_GATHERSIV8SF:
12102 icode = CODE_FOR_avx2_gathersiv8sf;
12103 goto gather_gen;
12104 case IX86_BUILTIN_GATHERDIV4SF:
12105 icode = CODE_FOR_avx2_gatherdiv4sf;
12106 goto gather_gen;
12107 case IX86_BUILTIN_GATHERDIV8SF:
12108 icode = CODE_FOR_avx2_gatherdiv8sf;
12109 goto gather_gen;
12110 case IX86_BUILTIN_GATHERSIV2DI:
12111 icode = CODE_FOR_avx2_gathersiv2di;
12112 goto gather_gen;
12113 case IX86_BUILTIN_GATHERSIV4DI:
12114 icode = CODE_FOR_avx2_gathersiv4di;
12115 goto gather_gen;
12116 case IX86_BUILTIN_GATHERDIV2DI:
12117 icode = CODE_FOR_avx2_gatherdiv2di;
12118 goto gather_gen;
12119 case IX86_BUILTIN_GATHERDIV4DI:
12120 icode = CODE_FOR_avx2_gatherdiv4di;
12121 goto gather_gen;
12122 case IX86_BUILTIN_GATHERSIV4SI:
12123 icode = CODE_FOR_avx2_gathersiv4si;
12124 goto gather_gen;
12125 case IX86_BUILTIN_GATHERSIV8SI:
12126 icode = CODE_FOR_avx2_gathersiv8si;
12127 goto gather_gen;
12128 case IX86_BUILTIN_GATHERDIV4SI:
12129 icode = CODE_FOR_avx2_gatherdiv4si;
12130 goto gather_gen;
12131 case IX86_BUILTIN_GATHERDIV8SI:
12132 icode = CODE_FOR_avx2_gatherdiv8si;
12133 goto gather_gen;
12134 case IX86_BUILTIN_GATHERALTSIV4DF:
12135 icode = CODE_FOR_avx2_gathersiv4df;
12136 goto gather_gen;
12137 case IX86_BUILTIN_GATHERALTDIV8SF:
12138 icode = CODE_FOR_avx2_gatherdiv8sf;
12139 goto gather_gen;
12140 case IX86_BUILTIN_GATHERALTSIV4DI:
12141 icode = CODE_FOR_avx2_gathersiv4di;
12142 goto gather_gen;
12143 case IX86_BUILTIN_GATHERALTDIV8SI:
12144 icode = CODE_FOR_avx2_gatherdiv8si;
12145 goto gather_gen;
12146 case IX86_BUILTIN_GATHER3SIV16SF:
12147 icode = CODE_FOR_avx512f_gathersiv16sf;
12148 goto gather_gen;
12149 case IX86_BUILTIN_GATHER3SIV8DF:
12150 icode = CODE_FOR_avx512f_gathersiv8df;
12151 goto gather_gen;
12152 case IX86_BUILTIN_GATHER3DIV16SF:
12153 icode = CODE_FOR_avx512f_gatherdiv16sf;
12154 goto gather_gen;
12155 case IX86_BUILTIN_GATHER3DIV8DF:
12156 icode = CODE_FOR_avx512f_gatherdiv8df;
12157 goto gather_gen;
12158 case IX86_BUILTIN_GATHER3SIV16SI:
12159 icode = CODE_FOR_avx512f_gathersiv16si;
12160 goto gather_gen;
12161 case IX86_BUILTIN_GATHER3SIV8DI:
12162 icode = CODE_FOR_avx512f_gathersiv8di;
12163 goto gather_gen;
12164 case IX86_BUILTIN_GATHER3DIV16SI:
12165 icode = CODE_FOR_avx512f_gatherdiv16si;
12166 goto gather_gen;
12167 case IX86_BUILTIN_GATHER3DIV8DI:
12168 icode = CODE_FOR_avx512f_gatherdiv8di;
12169 goto gather_gen;
12170 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12171 icode = CODE_FOR_avx512f_gathersiv8df;
12172 goto gather_gen;
12173 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12174 icode = CODE_FOR_avx512f_gatherdiv16sf;
12175 goto gather_gen;
12176 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12177 icode = CODE_FOR_avx512f_gathersiv8di;
12178 goto gather_gen;
12179 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12180 icode = CODE_FOR_avx512f_gatherdiv16si;
12181 goto gather_gen;
12182 case IX86_BUILTIN_GATHER3SIV2DF:
12183 icode = CODE_FOR_avx512vl_gathersiv2df;
12184 goto gather_gen;
12185 case IX86_BUILTIN_GATHER3SIV4DF:
12186 icode = CODE_FOR_avx512vl_gathersiv4df;
12187 goto gather_gen;
12188 case IX86_BUILTIN_GATHER3DIV2DF:
12189 icode = CODE_FOR_avx512vl_gatherdiv2df;
12190 goto gather_gen;
12191 case IX86_BUILTIN_GATHER3DIV4DF:
12192 icode = CODE_FOR_avx512vl_gatherdiv4df;
12193 goto gather_gen;
12194 case IX86_BUILTIN_GATHER3SIV4SF:
12195 icode = CODE_FOR_avx512vl_gathersiv4sf;
12196 goto gather_gen;
12197 case IX86_BUILTIN_GATHER3SIV8SF:
12198 icode = CODE_FOR_avx512vl_gathersiv8sf;
12199 goto gather_gen;
12200 case IX86_BUILTIN_GATHER3DIV4SF:
12201 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12202 goto gather_gen;
12203 case IX86_BUILTIN_GATHER3DIV8SF:
12204 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12205 goto gather_gen;
12206 case IX86_BUILTIN_GATHER3SIV2DI:
12207 icode = CODE_FOR_avx512vl_gathersiv2di;
12208 goto gather_gen;
12209 case IX86_BUILTIN_GATHER3SIV4DI:
12210 icode = CODE_FOR_avx512vl_gathersiv4di;
12211 goto gather_gen;
12212 case IX86_BUILTIN_GATHER3DIV2DI:
12213 icode = CODE_FOR_avx512vl_gatherdiv2di;
12214 goto gather_gen;
12215 case IX86_BUILTIN_GATHER3DIV4DI:
12216 icode = CODE_FOR_avx512vl_gatherdiv4di;
12217 goto gather_gen;
12218 case IX86_BUILTIN_GATHER3SIV4SI:
12219 icode = CODE_FOR_avx512vl_gathersiv4si;
12220 goto gather_gen;
12221 case IX86_BUILTIN_GATHER3SIV8SI:
12222 icode = CODE_FOR_avx512vl_gathersiv8si;
12223 goto gather_gen;
12224 case IX86_BUILTIN_GATHER3DIV4SI:
12225 icode = CODE_FOR_avx512vl_gatherdiv4si;
12226 goto gather_gen;
12227 case IX86_BUILTIN_GATHER3DIV8SI:
12228 icode = CODE_FOR_avx512vl_gatherdiv8si;
12229 goto gather_gen;
12230 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12231 icode = CODE_FOR_avx512vl_gathersiv4df;
12232 goto gather_gen;
12233 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12234 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12235 goto gather_gen;
12236 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12237 icode = CODE_FOR_avx512vl_gathersiv4di;
12238 goto gather_gen;
12239 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12240 icode = CODE_FOR_avx512vl_gatherdiv8si;
12241 goto gather_gen;
12242 case IX86_BUILTIN_SCATTERSIV16SF:
12243 icode = CODE_FOR_avx512f_scattersiv16sf;
12244 goto scatter_gen;
12245 case IX86_BUILTIN_SCATTERSIV8DF:
12246 icode = CODE_FOR_avx512f_scattersiv8df;
12247 goto scatter_gen;
12248 case IX86_BUILTIN_SCATTERDIV16SF:
12249 icode = CODE_FOR_avx512f_scatterdiv16sf;
12250 goto scatter_gen;
12251 case IX86_BUILTIN_SCATTERDIV8DF:
12252 icode = CODE_FOR_avx512f_scatterdiv8df;
12253 goto scatter_gen;
12254 case IX86_BUILTIN_SCATTERSIV16SI:
12255 icode = CODE_FOR_avx512f_scattersiv16si;
12256 goto scatter_gen;
12257 case IX86_BUILTIN_SCATTERSIV8DI:
12258 icode = CODE_FOR_avx512f_scattersiv8di;
12259 goto scatter_gen;
12260 case IX86_BUILTIN_SCATTERDIV16SI:
12261 icode = CODE_FOR_avx512f_scatterdiv16si;
12262 goto scatter_gen;
12263 case IX86_BUILTIN_SCATTERDIV8DI:
12264 icode = CODE_FOR_avx512f_scatterdiv8di;
12265 goto scatter_gen;
12266 case IX86_BUILTIN_SCATTERSIV8SF:
12267 icode = CODE_FOR_avx512vl_scattersiv8sf;
12268 goto scatter_gen;
12269 case IX86_BUILTIN_SCATTERSIV4SF:
12270 icode = CODE_FOR_avx512vl_scattersiv4sf;
12271 goto scatter_gen;
12272 case IX86_BUILTIN_SCATTERSIV4DF:
12273 icode = CODE_FOR_avx512vl_scattersiv4df;
12274 goto scatter_gen;
12275 case IX86_BUILTIN_SCATTERSIV2DF:
12276 icode = CODE_FOR_avx512vl_scattersiv2df;
12277 goto scatter_gen;
12278 case IX86_BUILTIN_SCATTERDIV8SF:
12279 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12280 goto scatter_gen;
12281 case IX86_BUILTIN_SCATTERDIV4SF:
12282 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12283 goto scatter_gen;
12284 case IX86_BUILTIN_SCATTERDIV4DF:
12285 icode = CODE_FOR_avx512vl_scatterdiv4df;
12286 goto scatter_gen;
12287 case IX86_BUILTIN_SCATTERDIV2DF:
12288 icode = CODE_FOR_avx512vl_scatterdiv2df;
12289 goto scatter_gen;
12290 case IX86_BUILTIN_SCATTERSIV8SI:
12291 icode = CODE_FOR_avx512vl_scattersiv8si;
12292 goto scatter_gen;
12293 case IX86_BUILTIN_SCATTERSIV4SI:
12294 icode = CODE_FOR_avx512vl_scattersiv4si;
12295 goto scatter_gen;
12296 case IX86_BUILTIN_SCATTERSIV4DI:
12297 icode = CODE_FOR_avx512vl_scattersiv4di;
12298 goto scatter_gen;
12299 case IX86_BUILTIN_SCATTERSIV2DI:
12300 icode = CODE_FOR_avx512vl_scattersiv2di;
12301 goto scatter_gen;
12302 case IX86_BUILTIN_SCATTERDIV8SI:
12303 icode = CODE_FOR_avx512vl_scatterdiv8si;
12304 goto scatter_gen;
12305 case IX86_BUILTIN_SCATTERDIV4SI:
12306 icode = CODE_FOR_avx512vl_scatterdiv4si;
12307 goto scatter_gen;
12308 case IX86_BUILTIN_SCATTERDIV4DI:
12309 icode = CODE_FOR_avx512vl_scatterdiv4di;
12310 goto scatter_gen;
12311 case IX86_BUILTIN_SCATTERDIV2DI:
12312 icode = CODE_FOR_avx512vl_scatterdiv2di;
12313 goto scatter_gen;
12314 case IX86_BUILTIN_GATHERPFDPD:
12315 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12316 goto vec_prefetch_gen;
12317 case IX86_BUILTIN_SCATTERALTSIV8DF:
12318 icode = CODE_FOR_avx512f_scattersiv8df;
12319 goto scatter_gen;
12320 case IX86_BUILTIN_SCATTERALTDIV16SF:
12321 icode = CODE_FOR_avx512f_scatterdiv16sf;
12322 goto scatter_gen;
12323 case IX86_BUILTIN_SCATTERALTSIV8DI:
12324 icode = CODE_FOR_avx512f_scattersiv8di;
12325 goto scatter_gen;
12326 case IX86_BUILTIN_SCATTERALTDIV16SI:
12327 icode = CODE_FOR_avx512f_scatterdiv16si;
12328 goto scatter_gen;
12329 case IX86_BUILTIN_SCATTERALTSIV4DF:
12330 icode = CODE_FOR_avx512vl_scattersiv4df;
12331 goto scatter_gen;
12332 case IX86_BUILTIN_SCATTERALTDIV8SF:
12333 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12334 goto scatter_gen;
12335 case IX86_BUILTIN_SCATTERALTSIV4DI:
12336 icode = CODE_FOR_avx512vl_scattersiv4di;
12337 goto scatter_gen;
12338 case IX86_BUILTIN_SCATTERALTDIV8SI:
12339 icode = CODE_FOR_avx512vl_scatterdiv8si;
12340 goto scatter_gen;
12341 case IX86_BUILTIN_SCATTERALTSIV2DF:
12342 icode = CODE_FOR_avx512vl_scattersiv2df;
12343 goto scatter_gen;
12344 case IX86_BUILTIN_SCATTERALTDIV4SF:
12345 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12346 goto scatter_gen;
12347 case IX86_BUILTIN_SCATTERALTSIV2DI:
12348 icode = CODE_FOR_avx512vl_scattersiv2di;
12349 goto scatter_gen;
12350 case IX86_BUILTIN_SCATTERALTDIV4SI:
12351 icode = CODE_FOR_avx512vl_scatterdiv4si;
12352 goto scatter_gen;
12353 case IX86_BUILTIN_GATHERPFDPS:
12354 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12355 goto vec_prefetch_gen;
12356 case IX86_BUILTIN_GATHERPFQPD:
12357 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12358 goto vec_prefetch_gen;
12359 case IX86_BUILTIN_GATHERPFQPS:
12360 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12361 goto vec_prefetch_gen;
12362 case IX86_BUILTIN_SCATTERPFDPD:
12363 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12364 goto vec_prefetch_gen;
12365 case IX86_BUILTIN_SCATTERPFDPS:
12366 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12367 goto vec_prefetch_gen;
12368 case IX86_BUILTIN_SCATTERPFQPD:
12369 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12370 goto vec_prefetch_gen;
12371 case IX86_BUILTIN_SCATTERPFQPS:
12372 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12373 goto vec_prefetch_gen;
12374
12375 gather_gen:
12376 rtx half;
12377 rtx (*gen) (rtx, rtx);
12378
12379 arg0 = CALL_EXPR_ARG (exp, 0);
12380 arg1 = CALL_EXPR_ARG (exp, 1);
12381 arg2 = CALL_EXPR_ARG (exp, 2);
12382 arg3 = CALL_EXPR_ARG (exp, 3);
12383 arg4 = CALL_EXPR_ARG (exp, 4);
12384 op0 = expand_normal (arg0);
12385 op1 = expand_normal (arg1);
12386 op2 = expand_normal (arg2);
12387 op3 = expand_normal (arg3);
12388 op4 = expand_normal (arg4);
12389 /* Note the arg order is different from the operand order. */
12390 mode0 = insn_data[icode].operand[1].mode;
12391 mode2 = insn_data[icode].operand[3].mode;
12392 mode3 = insn_data[icode].operand[4].mode;
12393 mode4 = insn_data[icode].operand[5].mode;
12394
12395 if (target == NULL_RTX
12396 || GET_MODE (target) != insn_data[icode].operand[0].mode
12397 || !insn_data[icode].operand[0].predicate (target,
12398 GET_MODE (target)))
12399 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12400 else
12401 subtarget = target;
12402
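/* The *ALT* gather variants pair an index vector with a data vector of
   a different element count; extract the low half of the over-wide
   operand so that both sides agree before emitting the gather.  */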
12403 switch (fcode)
12404 {
12405 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12406 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12407 half = gen_reg_rtx (V8SImode);
12408 if (!nonimmediate_operand (op2, V16SImode))
12409 op2 = copy_to_mode_reg (V16SImode, op2);
12410 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12411 op2 = half;
12412 break;
12413 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12414 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12415 case IX86_BUILTIN_GATHERALTSIV4DF:
12416 case IX86_BUILTIN_GATHERALTSIV4DI:
12417 half = gen_reg_rtx (V4SImode);
12418 if (!nonimmediate_operand (op2, V8SImode))
12419 op2 = copy_to_mode_reg (V8SImode, op2);
12420 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12421 op2 = half;
12422 break;
12423 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12424 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12425 half = gen_reg_rtx (mode0);
12426 if (mode0 == V8SFmode)
12427 gen = gen_vec_extract_lo_v16sf;
12428 else
12429 gen = gen_vec_extract_lo_v16si;
12430 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12431 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12432 emit_insn (gen (half, op0));
12433 op0 = half;
12434 op3 = lowpart_subreg (QImode, op3, HImode);
12435 break;
12436 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12437 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12438 case IX86_BUILTIN_GATHERALTDIV8SF:
12439 case IX86_BUILTIN_GATHERALTDIV8SI:
12440 half = gen_reg_rtx (mode0);
12441 if (mode0 == V4SFmode)
12442 gen = gen_vec_extract_lo_v8sf;
12443 else
12444 gen = gen_vec_extract_lo_v8si;
12445 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12446 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12447 emit_insn (gen (half, op0));
12448 op0 = half;
12449 if (VECTOR_MODE_P (GET_MODE (op3)))
12450 {
12451 half = gen_reg_rtx (mode0);
12452 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12453 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12454 emit_insn (gen (half, op3));
12455 op3 = half;
12456 }
12457 break;
12458 default:
12459 break;
12460 }
12461
12462 /* Force the memory operand to use only a base register here; we
12463 don't want to do this for the memory operands of other builtin
12464 functions.  */
12465 op1 = ix86_zero_extend_to_Pmode (op1);
12466
12467 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12468 op0 = copy_to_mode_reg (mode0, op0);
12469 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12470 op1 = copy_to_mode_reg (Pmode, op1);
12471 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12472 op2 = copy_to_mode_reg (mode2, op2);
12473
12474 op3 = fixup_modeless_constant (op3, mode3);
12475
12476 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12477 {
12478 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12479 op3 = copy_to_mode_reg (mode3, op3);
12480 }
12481 else
12482 {
12483 op3 = copy_to_reg (op3);
12484 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12485 }
12486 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12487 {
12488 error ("the last argument must be scale 1, 2, 4, 8");
12489 return const0_rtx;
12490 }
12491
12492 /* Optimize. If mask is known to have all high bits set,
12493 replace op0 with pc_rtx to signal that the instruction
12494 overwrites the whole destination and doesn't use its
12495 previous contents. */
12496 if (optimize)
12497 {
12498 if (TREE_CODE (arg3) == INTEGER_CST)
12499 {
12500 if (integer_all_onesp (arg3))
12501 op0 = pc_rtx;
12502 }
12503 else if (TREE_CODE (arg3) == VECTOR_CST)
12504 {
12505 unsigned int negative = 0;
12506 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12507 {
12508 tree cst = VECTOR_CST_ELT (arg3, i);
12509 if (TREE_CODE (cst) == INTEGER_CST
12510 && tree_int_cst_sign_bit (cst))
12511 negative++;
12512 else if (TREE_CODE (cst) == REAL_CST
12513 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12514 negative++;
12515 }
12516 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12517 op0 = pc_rtx;
12518 }
12519 else if (TREE_CODE (arg3) == SSA_NAME
12520 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12521 {
12522 /* Recognize also when mask is like:
12523 __v2df src = _mm_setzero_pd ();
12524 __v2df mask = _mm_cmpeq_pd (src, src);
12525 or
12526 __v8sf src = _mm256_setzero_ps ();
12527 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12528 as that is a cheaper way to load all ones into
12529 a register than having to load a constant from
12530 memory. */
12531 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12532 if (is_gimple_call (def_stmt))
12533 {
12534 tree fndecl = gimple_call_fndecl (def_stmt);
12535 if (fndecl
12536 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12537 switch (DECL_MD_FUNCTION_CODE (fndecl))
12538 {
12539 case IX86_BUILTIN_CMPPD:
12540 case IX86_BUILTIN_CMPPS:
12541 case IX86_BUILTIN_CMPPD256:
12542 case IX86_BUILTIN_CMPPS256:
12543 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12544 break;
12545 /* FALLTHRU */
12546 case IX86_BUILTIN_CMPEQPD:
12547 case IX86_BUILTIN_CMPEQPS:
12548 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12549 && initializer_zerop (gimple_call_arg (def_stmt,
12550 1)))
12551 op0 = pc_rtx;
12552 break;
12553 default:
12554 break;
12555 }
12556 }
12557 }
12558 }
12559
12560 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12561 if (! pat)
12562 return const0_rtx;
12563 emit_insn (pat);
12564
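/* For the DI-indexed gathers of 32-bit elements the insn's destination
   mode is twice as wide as the builtin's return type; the gathered
   elements land in the low half of subtarget, so extract that half
   into the user-visible target.  */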
12565 switch (fcode)
12566 {
12567 case IX86_BUILTIN_GATHER3DIV16SF:
12568 if (target == NULL_RTX)
12569 target = gen_reg_rtx (V8SFmode);
12570 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12571 break;
12572 case IX86_BUILTIN_GATHER3DIV16SI:
12573 if (target == NULL_RTX)
12574 target = gen_reg_rtx (V8SImode);
12575 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12576 break;
12577 case IX86_BUILTIN_GATHER3DIV8SF:
12578 case IX86_BUILTIN_GATHERDIV8SF:
12579 if (target == NULL_RTX)
12580 target = gen_reg_rtx (V4SFmode);
12581 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12582 break;
12583 case IX86_BUILTIN_GATHER3DIV8SI:
12584 case IX86_BUILTIN_GATHERDIV8SI:
12585 if (target == NULL_RTX)
12586 target = gen_reg_rtx (V4SImode);
12587 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12588 break;
12589 default:
12590 target = subtarget;
12591 break;
12592 }
12593 return target;
12594
12595 scatter_gen:
12596 arg0 = CALL_EXPR_ARG (exp, 0);
12597 arg1 = CALL_EXPR_ARG (exp, 1);
12598 arg2 = CALL_EXPR_ARG (exp, 2);
12599 arg3 = CALL_EXPR_ARG (exp, 3);
12600 arg4 = CALL_EXPR_ARG (exp, 4);
12601 op0 = expand_normal (arg0);
12602 op1 = expand_normal (arg1);
12603 op2 = expand_normal (arg2);
12604 op3 = expand_normal (arg3);
12605 op4 = expand_normal (arg4);
12606 mode1 = insn_data[icode].operand[1].mode;
12607 mode2 = insn_data[icode].operand[2].mode;
12608 mode3 = insn_data[icode].operand[3].mode;
12609 mode4 = insn_data[icode].operand[4].mode;
12610
12611 /* Scatter instruction stores operand op3 to memory with
12612 indices from op2 and scale from op4 under writemask op1.
12613 If index operand op2 has more elements than source operand
12614 op3, only its low half is used, and vice versa.  */
12615 switch (fcode)
12616 {
12617 case IX86_BUILTIN_SCATTERALTSIV8DF:
12618 case IX86_BUILTIN_SCATTERALTSIV8DI:
12619 half = gen_reg_rtx (V8SImode);
12620 if (!nonimmediate_operand (op2, V16SImode))
12621 op2 = copy_to_mode_reg (V16SImode, op2);
12622 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12623 op2 = half;
12624 break;
12625 case IX86_BUILTIN_SCATTERALTDIV16SF:
12626 case IX86_BUILTIN_SCATTERALTDIV16SI:
12627 half = gen_reg_rtx (mode3);
12628 if (mode3 == V8SFmode)
12629 gen = gen_vec_extract_lo_v16sf;
12630 else
12631 gen = gen_vec_extract_lo_v16si;
12632 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12633 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12634 emit_insn (gen (half, op3));
12635 op3 = half;
12636 break;
12637 case IX86_BUILTIN_SCATTERALTSIV4DF:
12638 case IX86_BUILTIN_SCATTERALTSIV4DI:
12639 half = gen_reg_rtx (V4SImode);
12640 if (!nonimmediate_operand (op2, V8SImode))
12641 op2 = copy_to_mode_reg (V8SImode, op2);
12642 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12643 op2 = half;
12644 break;
12645 case IX86_BUILTIN_SCATTERALTDIV8SF:
12646 case IX86_BUILTIN_SCATTERALTDIV8SI:
12647 half = gen_reg_rtx (mode3);
12648 if (mode3 == V4SFmode)
12649 gen = gen_vec_extract_lo_v8sf;
12650 else
12651 gen = gen_vec_extract_lo_v8si;
12652 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12653 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12654 emit_insn (gen (half, op3));
12655 op3 = half;
12656 break;
12657 case IX86_BUILTIN_SCATTERALTSIV2DF:
12658 case IX86_BUILTIN_SCATTERALTSIV2DI:
12659 if (!nonimmediate_operand (op2, V4SImode))
12660 op2 = copy_to_mode_reg (V4SImode, op2);
12661 break;
12662 case IX86_BUILTIN_SCATTERALTDIV4SF:
12663 case IX86_BUILTIN_SCATTERALTDIV4SI:
12664 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12665 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12666 break;
12667 default:
12668 break;
12669 }
12670
12671 /* Force the memory operand to use only a base register here; we
12672 don't want to do this for the memory operands of other builtin
12673 functions.  */
12674 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12675
12676 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12677 op0 = copy_to_mode_reg (Pmode, op0);
12678
12679 op1 = fixup_modeless_constant (op1, mode1);
12680
12681 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12682 {
12683 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12684 op1 = copy_to_mode_reg (mode1, op1);
12685 }
12686 else
12687 {
12688 op1 = copy_to_reg (op1);
12689 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12690 }
12691
12692 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12693 op2 = copy_to_mode_reg (mode2, op2);
12694
12695 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12696 op3 = copy_to_mode_reg (mode3, op3);
12697
12698 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12699 {
12700 error ("the last argument must be scale 1, 2, 4, 8");
12701 return const0_rtx;
12702 }
12703
12704 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12705 if (! pat)
12706 return const0_rtx;
12707
12708 emit_insn (pat);
12709 return 0;
12710
12711 vec_prefetch_gen:
12712 arg0 = CALL_EXPR_ARG (exp, 0);
12713 arg1 = CALL_EXPR_ARG (exp, 1);
12714 arg2 = CALL_EXPR_ARG (exp, 2);
12715 arg3 = CALL_EXPR_ARG (exp, 3);
12716 arg4 = CALL_EXPR_ARG (exp, 4);
12717 op0 = expand_normal (arg0);
12718 op1 = expand_normal (arg1);
12719 op2 = expand_normal (arg2);
12720 op3 = expand_normal (arg3);
12721 op4 = expand_normal (arg4);
12722 mode0 = insn_data[icode].operand[0].mode;
12723 mode1 = insn_data[icode].operand[1].mode;
12724 mode3 = insn_data[icode].operand[3].mode;
12725 mode4 = insn_data[icode].operand[4].mode;
12726
12727 op0 = fixup_modeless_constant (op0, mode0);
12728
12729 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12730 {
12731 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12732 op0 = copy_to_mode_reg (mode0, op0);
12733 }
12734 else
12735 {
12736 op0 = copy_to_reg (op0);
12737 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12738 }
12739
12740 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12741 op1 = copy_to_mode_reg (mode1, op1);
12742
12743 /* Force the memory operand to use only a base register here; we
12744 don't want to do this for the memory operands of other builtin
12745 functions.  */
12746 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12747
12748 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12749 op2 = copy_to_mode_reg (Pmode, op2);
12750
12751 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12752 {
12753 error ("the fourth argument must be scale 1, 2, 4, 8");
12754 return const0_rtx;
12755 }
12756
12757 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12758 {
12759 error ("incorrect hint operand");
12760 return const0_rtx;
12761 }
12762
12763 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12764 if (! pat)
12765 return const0_rtx;
12766
12767 emit_insn (pat);
12768
12769 return 0;
12770
12771 case IX86_BUILTIN_XABORT:
12772 icode = CODE_FOR_xabort;
12773 arg0 = CALL_EXPR_ARG (exp, 0);
12774 op0 = expand_normal (arg0);
12775 mode0 = insn_data[icode].operand[0].mode;
12776 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12777 {
12778 error ("the argument to the %<xabort%> intrinsic must "
12779 "be an 8-bit immediate");
12780 return const0_rtx;
12781 }
12782 emit_insn (gen_xabort (op0));
12783 return 0;
12784
12785 case IX86_BUILTIN_RDSSPD:
12786 case IX86_BUILTIN_RDSSPQ:
12787 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
12788
12789 if (target == 0
12790 || !register_operand (target, mode))
12791 target = gen_reg_rtx (mode);
12792
12793 op0 = force_reg (mode, const0_rtx);
12794
12795 emit_insn (gen_rdssp (mode, target, op0));
12796 return target;
12797
12798 case IX86_BUILTIN_INCSSPD:
12799 case IX86_BUILTIN_INCSSPQ:
12800 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
12801
12802 arg0 = CALL_EXPR_ARG (exp, 0);
12803 op0 = expand_normal (arg0);
12804
12805 op0 = force_reg (mode, op0);
12806
12807 emit_insn (gen_incssp (mode, op0));
12808 return 0;
12809
12810 case IX86_BUILTIN_RSTORSSP:
12811 case IX86_BUILTIN_CLRSSBSY:
12812 arg0 = CALL_EXPR_ARG (exp, 0);
12813 op0 = expand_normal (arg0);
12814 icode = (fcode == IX86_BUILTIN_RSTORSSP
12815 ? CODE_FOR_rstorssp
12816 : CODE_FOR_clrssbsy);
12817
12818 if (!address_operand (op0, VOIDmode))
12819 {
12820 op0 = convert_memory_address (Pmode, op0);
12821 op0 = copy_addr_to_reg (op0);
12822 }
12823 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
12824 return 0;
12825
12826 case IX86_BUILTIN_WRSSD:
12827 case IX86_BUILTIN_WRSSQ:
12828 case IX86_BUILTIN_WRUSSD:
12829 case IX86_BUILTIN_WRUSSQ:
12830 mode = ((fcode == IX86_BUILTIN_WRSSD
12831 || fcode == IX86_BUILTIN_WRUSSD)
12832 ? SImode : DImode);
12833
12834 arg0 = CALL_EXPR_ARG (exp, 0);
12835 op0 = expand_normal (arg0);
12836 arg1 = CALL_EXPR_ARG (exp, 1);
12837 op1 = expand_normal (arg1);
12838
12839 op0 = force_reg (mode, op0);
12840
12841 if (!address_operand (op1, VOIDmode))
12842 {
12843 op1 = convert_memory_address (Pmode, op1);
12844 op1 = copy_addr_to_reg (op1);
12845 }
12846 op1 = gen_rtx_MEM (mode, op1);
12847
12848 icode = ((fcode == IX86_BUILTIN_WRSSD
12849 || fcode == IX86_BUILTIN_WRSSQ)
12850 ? code_for_wrss (mode)
12851 : code_for_wruss (mode));
12852 emit_insn (GEN_FCN (icode) (op0, op1));
12853
12854 return 0;
12855
12856 default:
12857 break;
12858 }
12859
12860 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12861 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12862 {
12863 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12864 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12865 target);
12866 }
12867
12868 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12869 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12870 {
12871 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12872 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12873 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12874 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12875 int masked = 1;
12876 machine_mode mode, wide_mode, nar_mode;
12877
12878 nar_mode = V4SFmode;
12879 mode = V16SFmode;
12880 wide_mode = V64SFmode;
12881 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12882 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12883
12884 switch (fcode)
12885 {
12886 case IX86_BUILTIN_4FMAPS:
12887 fcn = gen_avx5124fmaddps_4fmaddps;
12888 masked = 0;
12889 goto v4fma_expand;
12890
12891 case IX86_BUILTIN_4DPWSSD:
12892 nar_mode = V4SImode;
12893 mode = V16SImode;
12894 wide_mode = V64SImode;
12895 fcn = gen_avx5124vnniw_vp4dpwssd;
12896 masked = 0;
12897 goto v4fma_expand;
12898
12899 case IX86_BUILTIN_4DPWSSDS:
12900 nar_mode = V4SImode;
12901 mode = V16SImode;
12902 wide_mode = V64SImode;
12903 fcn = gen_avx5124vnniw_vp4dpwssds;
12904 masked = 0;
12905 goto v4fma_expand;
12906
12907 case IX86_BUILTIN_4FNMAPS:
12908 fcn = gen_avx5124fmaddps_4fnmaddps;
12909 masked = 0;
12910 goto v4fma_expand;
12911
12912 case IX86_BUILTIN_4FNMAPS_MASK:
12913 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12914 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12915 goto v4fma_expand;
12916
12917 case IX86_BUILTIN_4DPWSSD_MASK:
12918 nar_mode = V4SImode;
12919 mode = V16SImode;
12920 wide_mode = V64SImode;
12921 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12922 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12923 goto v4fma_expand;
12924
12925 case IX86_BUILTIN_4DPWSSDS_MASK:
12926 nar_mode = V4SImode;
12927 mode = V16SImode;
12928 wide_mode = V64SImode;
12929 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12930 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12931 goto v4fma_expand;
12932
12933 case IX86_BUILTIN_4FMAPS_MASK:
12934 {
12935 tree args[4];
12936 rtx ops[4];
12937 rtx wide_reg;
12938 rtx accum;
12939 rtx addr;
12940 rtx mem;
12941
12942 v4fma_expand:
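	   /* These multi-step 4FMAPS/4VNNIW insns consume four consecutive
	      vector operands; glue the four 512-bit arguments into one wide
	      pseudo by placing argument i at byte offset i * 64.  */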
12943 wide_reg = gen_reg_rtx (wide_mode);
12944 for (i = 0; i < 4; i++)
12945 {
12946 args[i] = CALL_EXPR_ARG (exp, i);
12947 ops[i] = expand_normal (args[i]);
12948
12949 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12950 ops[i]);
12951 }
12952
12953 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12954 accum = force_reg (mode, accum);
12955
12956 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12957 addr = force_reg (Pmode, addr);
12958
12959 mem = gen_rtx_MEM (nar_mode, addr);
12960
12961 target = gen_reg_rtx (mode);
12962
12963 emit_move_insn (target, accum);
12964
12965 if (! masked)
12966 emit_insn (fcn (target, accum, wide_reg, mem));
12967 else
12968 {
12969 rtx merge, mask;
12970 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12971
12972 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12973
12974 if (CONST_INT_P (mask))
12975 mask = fixup_modeless_constant (mask, HImode);
12976
12977 mask = force_reg (HImode, mask);
12978
12979 if (GET_MODE (mask) != HImode)
12980 mask = gen_rtx_SUBREG (HImode, mask, 0);
12981
12982 /* If merge is 0 then we're about to emit the z-masked variant.  */
12983 if (const0_operand (merge, mode))
12984 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12985 /* If merge is the same as accum then emit the merge-masked variant.  */
12986 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12987 {
12988 merge = force_reg (mode, merge);
12989 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12990 }
12991 /* Merging with something unknown can happen if we z-mask with -O0.  */
12992 else
12993 {
12994 target = gen_reg_rtx (mode);
12995 emit_move_insn (target, merge);
12996 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12997 }
12998 }
12999 return target;
13000 }
13001
13002 case IX86_BUILTIN_4FNMASS:
13003 fcn = gen_avx5124fmaddps_4fnmaddss;
13004 masked = 0;
13005 goto s4fma_expand;
13006
13007 case IX86_BUILTIN_4FMASS:
13008 fcn = gen_avx5124fmaddps_4fmaddss;
13009 masked = 0;
13010 goto s4fma_expand;
13011
13012 case IX86_BUILTIN_4FNMASS_MASK:
13013 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13014 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13015 goto s4fma_expand;
13016
13017 case IX86_BUILTIN_4FMASS_MASK:
13018 {
13019 tree args[4];
13020 rtx ops[4];
13021 rtx wide_reg;
13022 rtx accum;
13023 rtx addr;
13024 rtx mem;
13025
13026 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13027 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13028
13029 s4fma_expand:
13030 mode = V4SFmode;
13031 wide_reg = gen_reg_rtx (V64SFmode);
13032 for (i = 0; i < 4; i++)
13033 {
13034 rtx tmp;
13035 args[i] = CALL_EXPR_ARG (exp, i);
13036 ops[i] = expand_normal (args[i]);
13037
13038 tmp = gen_reg_rtx (SFmode);
13039 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13040
13041 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13042 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13043 }
13044
13045 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13046 accum = force_reg (V4SFmode, accum);
13047
13048 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13049 addr = force_reg (Pmode, addr);
13050
13051 mem = gen_rtx_MEM (V4SFmode, addr);
13052
13053 target = gen_reg_rtx (V4SFmode);
13054
13055 emit_move_insn (target, accum);
13056
13057 if (! masked)
13058 emit_insn (fcn (target, accum, wide_reg, mem));
13059 else
13060 {
13061 rtx merge, mask;
13062 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13063
13064 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13065
13066 if (CONST_INT_P (mask))
13067 mask = fixup_modeless_constant (mask, QImode);
13068
13069 mask = force_reg (QImode, mask);
13070
13071 if (GET_MODE (mask) != QImode)
13072 mask = gen_rtx_SUBREG (QImode, mask, 0);
13073
13074 /* If merge is 0 then we're about to emit the z-masked variant.  */
13075 if (const0_operand (merge, mode))
13076 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13077 /* If merge is the same as accum then emit the merge-masked
13078 variant.  */
13079 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13080 {
13081 merge = force_reg (mode, merge);
13082 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13083 }
13084 /* Merging with something unknown can happen if we z-mask
13085 with -O0.  */
13086 else
13087 {
13088 target = gen_reg_rtx (mode);
13089 emit_move_insn (target, merge);
13090 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13091 }
13092 }
13093 return target;
13094 }
13095 case IX86_BUILTIN_RDPID:
13096 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13097 target);
13098 case IX86_BUILTIN_FABSQ:
13099 case IX86_BUILTIN_COPYSIGNQ:
13100 if (!TARGET_SSE)
13101 /* Emit a normal call if SSE isn't available. */
13102 return expand_call (exp, target, ignore);
13103 /* FALLTHRU */
13104 default:
13105 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13106 }
13107 }
13108
13109 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13110 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13111 {
13112 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13113 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13114 }
13115
13116 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13117 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13118 {
13119 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13120 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13121 }
13122
13123 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13124 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13125 {
13126 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13127 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13128 }
13129
13130 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13131 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13132 {
13133 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13134 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13135 }
13136
13137 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13138 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13139 {
13140 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13141 const struct builtin_description *d = bdesc_multi_arg + i;
13142 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13143 (enum ix86_builtin_func_type)
13144 d->flag, d->comparison);
13145 }
13146
13147 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13148 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13149 {
13150 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13151 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13152 target);
13153 }
13154
13155 gcc_unreachable ();
13156 }
13157
13158 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13159 fill TARGET with VAL via vec_duplicate. */
13160
13161 static bool
13162 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13163 {
13164 bool ok;
13165 rtx_insn *insn;
13166 rtx dup;
13167
13168 /* First attempt to recognize VAL as-is. */
13169 dup = gen_vec_duplicate (mode, val);
13170 insn = emit_insn (gen_rtx_SET (target, dup));
13171 if (recog_memoized (insn) < 0)
13172 {
13173 rtx_insn *seq;
13174 machine_mode innermode = GET_MODE_INNER (mode);
13175 rtx reg;
13176
13177 /* If that fails, force VAL into a register. */
13178
13179 start_sequence ();
13180 reg = force_reg (innermode, val);
13181 if (GET_MODE (reg) != innermode)
13182 reg = gen_lowpart (innermode, reg);
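/* Splice the register-based duplicate into the SET that was already
   emitted above and re-recognize it; the sequence forcing VAL into a
   register is emitted just before that insn.  */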
13183 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13184 seq = get_insns ();
13185 end_sequence ();
13186 if (seq)
13187 emit_insn_before (seq, insn);
13188
13189 ok = recog_memoized (insn) >= 0;
13190 gcc_assert (ok);
13191 }
13192 return true;
13193 }
13194
13195 /* Get a vector mode of the same size as the original but with elements
13196 twice as wide. This is only guaranteed to apply to integral vectors. */
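/* For example, V16QImode is expected to yield V8HImode, and V8HImode
   V4SImode; the asserts below catch any other ordering.  */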
13197
13198 static machine_mode
13199 get_mode_wider_vector (machine_mode o)
13200 {
13201 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13202 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13203 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13204 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13205 return n;
13206 }
13207
13208 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13209 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13210
13211 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13212 with all elements equal to VAL. Return true if successful. */
13213
13214 static bool
13215 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13216 rtx target, rtx val)
13217 {
13218 bool ok;
13219
13220 switch (mode)
13221 {
13222 case E_V2SImode:
13223 case E_V2SFmode:
13224 if (!mmx_ok)
13225 return false;
13226 /* FALLTHRU */
13227
13228 case E_V4DFmode:
13229 case E_V4DImode:
13230 case E_V8SFmode:
13231 case E_V8SImode:
13232 case E_V2DFmode:
13233 case E_V2DImode:
13234 case E_V4SFmode:
13235 case E_V4SImode:
13236 case E_V16SImode:
13237 case E_V8DImode:
13238 case E_V16SFmode:
13239 case E_V8DFmode:
13240 return ix86_vector_duplicate_value (mode, target, val);
13241
13242 case E_V4HImode:
13243 if (!mmx_ok)
13244 return false;
13245 if (TARGET_SSE || TARGET_3DNOW_A)
13246 {
13247 rtx x;
13248
13249 val = gen_lowpart (SImode, val);
13250 x = gen_rtx_TRUNCATE (HImode, val);
13251 x = gen_rtx_VEC_DUPLICATE (mode, x);
13252 emit_insn (gen_rtx_SET (target, x));
13253 return true;
13254 }
13255 goto widen;
13256
13257 case E_V8QImode:
13258 if (!mmx_ok)
13259 return false;
13260 goto widen;
13261
13262 case E_V8HImode:
13263 if (TARGET_AVX2)
13264 return ix86_vector_duplicate_value (mode, target, val);
13265
13266 if (TARGET_SSE2)
13267 {
13268 struct expand_vec_perm_d dperm;
13269 rtx tmp1, tmp2;
13270
13271 permute:
13272 memset (&dperm, 0, sizeof (dperm));
13273 dperm.target = target;
13274 dperm.vmode = mode;
13275 dperm.nelt = GET_MODE_NUNITS (mode);
13276 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13277 dperm.one_operand_p = true;
13278
13279 /* Extend to SImode using a paradoxical SUBREG. */
13280 tmp1 = gen_reg_rtx (SImode);
13281 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13282
13283 /* Insert the SImode value as low element of a V4SImode vector. */
13284 tmp2 = gen_reg_rtx (V4SImode);
13285 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13286 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13287
13288 ok = (expand_vec_perm_1 (&dperm)
13289 || expand_vec_perm_broadcast_1 (&dperm));
13290 gcc_assert (ok);
13291 return ok;
13292 }
13293 goto widen;
13294
13295 case E_V16QImode:
13296 if (TARGET_AVX2)
13297 return ix86_vector_duplicate_value (mode, target, val);
13298
13299 if (TARGET_SSE2)
13300 goto permute;
13301 goto widen;
13302
13303 widen:
13304 /* Replicate the value once into the next wider mode and recurse. */
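/* For example, a V8QImode broadcast ORs two copies of the QImode value
   into an HImode scalar and retries the broadcast in V4HImode.  */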
13305 {
13306 machine_mode smode, wsmode, wvmode;
13307 rtx x;
13308
13309 smode = GET_MODE_INNER (mode);
13310 wvmode = get_mode_wider_vector (mode);
13311 wsmode = GET_MODE_INNER (wvmode);
13312
13313 val = convert_modes (wsmode, smode, val, true);
13314 x = expand_simple_binop (wsmode, ASHIFT, val,
13315 GEN_INT (GET_MODE_BITSIZE (smode)),
13316 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13317 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13318
13319 x = gen_reg_rtx (wvmode);
13320 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13321 gcc_assert (ok);
13322 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13323 return ok;
13324 }
13325
13326 case E_V16HImode:
13327 case E_V32QImode:
13328 if (TARGET_AVX2)
13329 return ix86_vector_duplicate_value (mode, target, val);
13330 else
13331 {
13332 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13333 rtx x = gen_reg_rtx (hvmode);
13334
13335 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13336 gcc_assert (ok);
13337
13338 x = gen_rtx_VEC_CONCAT (mode, x, x);
13339 emit_insn (gen_rtx_SET (target, x));
13340 }
13341 return true;
13342
13343 case E_V64QImode:
13344 case E_V32HImode:
13345 if (TARGET_AVX512BW)
13346 return ix86_vector_duplicate_value (mode, target, val);
13347 else
13348 {
13349 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13350 rtx x = gen_reg_rtx (hvmode);
13351
13352 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13353 gcc_assert (ok);
13354
13355 x = gen_rtx_VEC_CONCAT (mode, x, x);
13356 emit_insn (gen_rtx_SET (target, x));
13357 }
13358 return true;
13359
13360 default:
13361 return false;
13362 }
13363 }
13364
13365 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13366 whose ONE_VAR element is VAR, and other elements are zero. Return true
13367 if successful. */
13368
13369 static bool
13370 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13371 rtx target, rtx var, int one_var)
13372 {
13373 machine_mode vsimode;
13374 rtx new_target;
13375 rtx x, tmp;
13376 bool use_vector_set = false;
13377 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13378
13379 switch (mode)
13380 {
13381 case E_V2DImode:
13382 /* For SSE4.1, we normally use vector set. But if the second
13383 element is zero and inter-unit moves are OK, we use movq
13384 instead. */
13385 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13386 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13387 && one_var == 0));
13388 break;
13389 case E_V16QImode:
13390 case E_V4SImode:
13391 case E_V4SFmode:
13392 use_vector_set = TARGET_SSE4_1;
13393 break;
13394 case E_V8HImode:
13395 use_vector_set = TARGET_SSE2;
13396 break;
13397 case E_V8QImode:
13398 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13399 break;
13400 case E_V4HImode:
13401 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13402 break;
13403 case E_V32QImode:
13404 case E_V16HImode:
13405 use_vector_set = TARGET_AVX;
13406 break;
13407 case E_V8SImode:
13408 use_vector_set = TARGET_AVX;
13409 gen_vec_set_0 = gen_vec_setv8si_0;
13410 break;
13411 case E_V8SFmode:
13412 use_vector_set = TARGET_AVX;
13413 gen_vec_set_0 = gen_vec_setv8sf_0;
13414 break;
13415 case E_V4DFmode:
13416 use_vector_set = TARGET_AVX;
13417 gen_vec_set_0 = gen_vec_setv4df_0;
13418 break;
13419 case E_V4DImode:
13420 /* Use ix86_expand_vector_set in 64bit mode only. */
13421 use_vector_set = TARGET_AVX && TARGET_64BIT;
13422 gen_vec_set_0 = gen_vec_setv4di_0;
13423 break;
13424 case E_V16SImode:
13425 use_vector_set = TARGET_AVX512F && one_var == 0;
13426 gen_vec_set_0 = gen_vec_setv16si_0;
13427 break;
13428 case E_V16SFmode:
13429 use_vector_set = TARGET_AVX512F && one_var == 0;
13430 gen_vec_set_0 = gen_vec_setv16sf_0;
13431 break;
13432 case E_V8DFmode:
13433 use_vector_set = TARGET_AVX512F && one_var == 0;
13434 gen_vec_set_0 = gen_vec_setv8df_0;
13435 break;
13436 case E_V8DImode:
13437 /* Use ix86_expand_vector_set in 64bit mode only. */
13438 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13439 gen_vec_set_0 = gen_vec_setv8di_0;
13440 break;
13441 default:
13442 break;
13443 }
13444
13445 if (use_vector_set)
13446 {
13447 if (gen_vec_set_0 && one_var == 0)
13448 {
13449 var = force_reg (GET_MODE_INNER (mode), var);
13450 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13451 return true;
13452 }
13453 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13454 var = force_reg (GET_MODE_INNER (mode), var);
13455 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13456 return true;
13457 }
13458
13459 switch (mode)
13460 {
13461 case E_V2SFmode:
13462 case E_V2SImode:
13463 if (!mmx_ok)
13464 return false;
13465 /* FALLTHRU */
13466
13467 case E_V2DFmode:
13468 case E_V2DImode:
13469 if (one_var != 0)
13470 return false;
13471 var = force_reg (GET_MODE_INNER (mode), var);
13472 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13473 emit_insn (gen_rtx_SET (target, x));
13474 return true;
13475
13476 case E_V4SFmode:
13477 case E_V4SImode:
13478 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13479 new_target = gen_reg_rtx (mode);
13480 else
13481 new_target = target;
13482 var = force_reg (GET_MODE_INNER (mode), var);
13483 x = gen_rtx_VEC_DUPLICATE (mode, var);
13484 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13485 emit_insn (gen_rtx_SET (new_target, x));
13486 if (one_var != 0)
13487 {
13488 /* We need to shuffle the value to the correct position, so
13489 create a new pseudo to store the intermediate result. */
13490
13491 /* With SSE2, we can use the integer shuffle insns. */
13492 if (mode != V4SFmode && TARGET_SSE2)
13493 {
13494 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13495 const1_rtx,
13496 GEN_INT (one_var == 1 ? 0 : 1),
13497 GEN_INT (one_var == 2 ? 0 : 1),
13498 GEN_INT (one_var == 3 ? 0 : 1)));
13499 if (target != new_target)
13500 emit_move_insn (target, new_target);
13501 return true;
13502 }
13503
13504 /* Otherwise convert the intermediate result to V4SFmode and
13505 use the SSE1 shuffle instructions. */
13506 if (mode != V4SFmode)
13507 {
13508 tmp = gen_reg_rtx (V4SFmode);
13509 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13510 }
13511 else
13512 tmp = new_target;
13513
13514 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13515 const1_rtx,
13516 GEN_INT (one_var == 1 ? 0 : 1),
13517 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13518 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13519
13520 if (mode != V4SFmode)
13521 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13522 else if (tmp != target)
13523 emit_move_insn (target, tmp);
13524 }
13525 else if (target != new_target)
13526 emit_move_insn (target, new_target);
13527 return true;
13528
13529 case E_V8HImode:
13530 case E_V16QImode:
13531 vsimode = V4SImode;
13532 goto widen;
13533 case E_V4HImode:
13534 case E_V8QImode:
13535 if (!mmx_ok)
13536 return false;
13537 vsimode = V2SImode;
13538 goto widen;
13539 widen:
13540 if (one_var != 0)
13541 return false;
13542
13543 /* Zero extend the variable element to SImode and recurse. */
13544 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13545
13546 x = gen_reg_rtx (vsimode);
13547 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13548 var, one_var))
13549 gcc_unreachable ();
13550
13551 emit_move_insn (target, gen_lowpart (mode, x));
13552 return true;
13553
13554 default:
13555 return false;
13556 }
13557 }
13558
13559 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13560 consisting of the values in VALS. It is known that all elements
13561 except ONE_VAR are constants. Return true if successful. */
13562
13563 static bool
13564 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13565 rtx target, rtx vals, int one_var)
13566 {
13567 rtx var = XVECEXP (vals, 0, one_var);
13568 machine_mode wmode;
13569 rtx const_vec, x;
13570
13571 const_vec = copy_rtx (vals);
13572 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13573 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13574
13575 switch (mode)
13576 {
13577 case E_V2DFmode:
13578 case E_V2DImode:
13579 case E_V2SFmode:
13580 case E_V2SImode:
13581 /* For the two element vectors, it's just as easy to use
13582 the general case. */
13583 return false;
13584
13585 case E_V4DImode:
13586 /* Use ix86_expand_vector_set in 64bit mode only. */
13587 if (!TARGET_64BIT)
13588 return false;
13589 /* FALLTHRU */
13590 case E_V4DFmode:
13591 case E_V8SFmode:
13592 case E_V8SImode:
13593 case E_V16HImode:
13594 case E_V32QImode:
13595 case E_V4SFmode:
13596 case E_V4SImode:
13597 case E_V8HImode:
13598 case E_V4HImode:
13599 break;
13600
13601 case E_V16QImode:
13602 if (TARGET_SSE4_1)
13603 break;
13604 wmode = V8HImode;
13605 goto widen;
13606 case E_V8QImode:
13607 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13608 break;
13609 wmode = V4HImode;
13610 goto widen;
13611 widen:
13612 /* There's no way to set one QImode entry easily. Combine
13613 the variable value with its adjacent constant value, and
13614 promote to an HImode set. */
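/* For instance, with ONE_VAR == 3 the variable byte is shifted into
   the high half of HImode element 1 and combined with the constant
   byte below it.  */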
13615 x = XVECEXP (vals, 0, one_var ^ 1);
13616 if (one_var & 1)
13617 {
13618 var = convert_modes (HImode, QImode, var, true);
13619 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13620 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13621 x = GEN_INT (INTVAL (x) & 0xff);
13622 }
13623 else
13624 {
13625 var = convert_modes (HImode, QImode, var, true);
13626 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13627 }
13628 if (x != const0_rtx)
13629 var = expand_simple_binop (HImode, IOR, var, x, var,
13630 1, OPTAB_LIB_WIDEN);
13631
13632 x = gen_reg_rtx (wmode);
13633 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13634 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13635
13636 emit_move_insn (target, gen_lowpart (mode, x));
13637 return true;
13638
13639 default:
13640 return false;
13641 }
13642
13643 emit_move_insn (target, const_vec);
13644 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13645 return true;
13646 }
13647
13648 /* A subroutine of ix86_expand_vector_init_general. Use vector
13649 concatenate to handle the most general case: all values variable,
13650 and none identical. */
13651
13652 static void
13653 ix86_expand_vector_init_concat (machine_mode mode,
13654 rtx target, rtx *ops, int n)
13655 {
13656 machine_mode half_mode = VOIDmode;
13657 rtx half[2];
13658 rtvec v;
13659 int i, j;
13660
13661 switch (n)
13662 {
13663 case 2:
13664 switch (mode)
13665 {
13666 case E_V16SImode:
13667 half_mode = V8SImode;
13668 break;
13669 case E_V16SFmode:
13670 half_mode = V8SFmode;
13671 break;
13672 case E_V8DImode:
13673 half_mode = V4DImode;
13674 break;
13675 case E_V8DFmode:
13676 half_mode = V4DFmode;
13677 break;
13678 case E_V8SImode:
13679 half_mode = V4SImode;
13680 break;
13681 case E_V8SFmode:
13682 half_mode = V4SFmode;
13683 break;
13684 case E_V4DImode:
13685 half_mode = V2DImode;
13686 break;
13687 case E_V4DFmode:
13688 half_mode = V2DFmode;
13689 break;
13690 case E_V4SImode:
13691 half_mode = V2SImode;
13692 break;
13693 case E_V4SFmode:
13694 half_mode = V2SFmode;
13695 break;
13696 case E_V2DImode:
13697 half_mode = DImode;
13698 break;
13699 case E_V2SImode:
13700 half_mode = SImode;
13701 break;
13702 case E_V2DFmode:
13703 half_mode = DFmode;
13704 break;
13705 case E_V2SFmode:
13706 half_mode = SFmode;
13707 break;
13708 default:
13709 gcc_unreachable ();
13710 }
13711
13712 if (!register_operand (ops[1], half_mode))
13713 ops[1] = force_reg (half_mode, ops[1]);
13714 if (!register_operand (ops[0], half_mode))
13715 ops[0] = force_reg (half_mode, ops[0]);
13716 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13717 ops[1])));
13718 break;
13719
13720 case 4:
13721 switch (mode)
13722 {
13723 case E_V4DImode:
13724 half_mode = V2DImode;
13725 break;
13726 case E_V4DFmode:
13727 half_mode = V2DFmode;
13728 break;
13729 case E_V4SImode:
13730 half_mode = V2SImode;
13731 break;
13732 case E_V4SFmode:
13733 half_mode = V2SFmode;
13734 break;
13735 default:
13736 gcc_unreachable ();
13737 }
13738 goto half;
13739
13740 case 8:
13741 switch (mode)
13742 {
13743 case E_V8DImode:
13744 half_mode = V4DImode;
13745 break;
13746 case E_V8DFmode:
13747 half_mode = V4DFmode;
13748 break;
13749 case E_V8SImode:
13750 half_mode = V4SImode;
13751 break;
13752 case E_V8SFmode:
13753 half_mode = V4SFmode;
13754 break;
13755 default:
13756 gcc_unreachable ();
13757 }
13758 goto half;
13759
13760 case 16:
13761 switch (mode)
13762 {
13763 case E_V16SImode:
13764 half_mode = V8SImode;
13765 break;
13766 case E_V16SFmode:
13767 half_mode = V8SFmode;
13768 break;
13769 default:
13770 gcc_unreachable ();
13771 }
13772 goto half;
13773
13774 half:
13775 /* FIXME: We process inputs backward to help RA. PR 36222. */
13776 i = n - 1;
13777 for (j = 1; j != -1; j--)
13778 {
13779 half[j] = gen_reg_rtx (half_mode);
13780 switch (n >> 1)
13781 {
13782 case 2:
13783 v = gen_rtvec (2, ops[i-1], ops[i]);
13784 i -= 2;
13785 break;
13786 case 4:
13787 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
13788 i -= 4;
13789 break;
13790 case 8:
13791 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
13792 ops[i-3], ops[i-2], ops[i-1], ops[i]);
13793 i -= 8;
13794 break;
13795 default:
13796 gcc_unreachable ();
13797 }
13798 ix86_expand_vector_init (false, half[j],
13799 gen_rtx_PARALLEL (half_mode, v));
13800 }
13801
13802 ix86_expand_vector_init_concat (mode, target, half, 2);
13803 break;
13804
13805 default:
13806 gcc_unreachable ();
13807 }
13808 }
13809
13810 /* A subroutine of ix86_expand_vector_init_general. Use vector
13811 interleave to handle the most general case: all values variable,
13812 and none identical. */
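/* Each pair of scalar elements is first loaded into the two low lanes
   of its own register; those registers are then merged with successive
   interleave-low operations until a single full-width vector remains.  */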
13813
13814 static void
13815 ix86_expand_vector_init_interleave (machine_mode mode,
13816 rtx target, rtx *ops, int n)
13817 {
13818 machine_mode first_imode, second_imode, third_imode, inner_mode;
13819 int i, j;
13820 rtx op0, op1;
13821 rtx (*gen_load_even) (rtx, rtx, rtx);
13822 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13823 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13824
13825 switch (mode)
13826 {
13827 case E_V8HImode:
13828 gen_load_even = gen_vec_setv8hi;
13829 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13830 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13831 inner_mode = HImode;
13832 first_imode = V4SImode;
13833 second_imode = V2DImode;
13834 third_imode = VOIDmode;
13835 break;
13836 case E_V16QImode:
13837 gen_load_even = gen_vec_setv16qi;
13838 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13839 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13840 inner_mode = QImode;
13841 first_imode = V8HImode;
13842 second_imode = V4SImode;
13843 third_imode = V2DImode;
13844 break;
13845 default:
13846 gcc_unreachable ();
13847 }
13848
13849 for (i = 0; i < n; i++)
13850 {
13851 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13852 op0 = gen_reg_rtx (SImode);
13853 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13854
13855 /* Insert the SImode value as low element of V4SImode vector. */
13856 op1 = gen_reg_rtx (V4SImode);
13857 op0 = gen_rtx_VEC_MERGE (V4SImode,
13858 gen_rtx_VEC_DUPLICATE (V4SImode,
13859 op0),
13860 CONST0_RTX (V4SImode),
13861 const1_rtx);
13862 emit_insn (gen_rtx_SET (op1, op0));
13863
13864 /* Cast the V4SImode vector back to a vector in the original mode. */
13865 op0 = gen_reg_rtx (mode);
13866 emit_move_insn (op0, gen_lowpart (mode, op1));
13867
13868 /* Load even elements into the second position. */
13869 emit_insn (gen_load_even (op0,
13870 force_reg (inner_mode,
13871 ops [i + i + 1]),
13872 const1_rtx));
13873
13874 /* Cast vector to FIRST_IMODE vector. */
13875 ops[i] = gen_reg_rtx (first_imode);
13876 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13877 }
13878
13879 /* Interleave low FIRST_IMODE vectors. */
13880 for (i = j = 0; i < n; i += 2, j++)
13881 {
13882 op0 = gen_reg_rtx (first_imode);
13883 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13884
13885 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13886 ops[j] = gen_reg_rtx (second_imode);
13887 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13888 }
13889
13890 /* Interleave low SECOND_IMODE vectors. */
13891 switch (second_imode)
13892 {
13893 case E_V4SImode:
13894 for (i = j = 0; i < n / 2; i += 2, j++)
13895 {
13896 op0 = gen_reg_rtx (second_imode);
13897 emit_insn (gen_interleave_second_low (op0, ops[i],
13898 ops[i + 1]));
13899
13900 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13901 vector. */
13902 ops[j] = gen_reg_rtx (third_imode);
13903 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13904 }
13905 second_imode = V2DImode;
13906 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13907 /* FALLTHRU */
13908
13909 case E_V2DImode:
13910 op0 = gen_reg_rtx (second_imode);
13911 emit_insn (gen_interleave_second_low (op0, ops[0],
13912 ops[1]));
13913
13914 /* Cast the SECOND_IMODE vector back to a vector in the original
13915 mode. */
13916 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13917 break;
13918
13919 default:
13920 gcc_unreachable ();
13921 }
13922 }
13923
13924 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13925 all values variable, and none identical. */
13926
13927 static void
13928 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13929 rtx target, rtx vals)
13930 {
13931 rtx ops[64], op0, op1, op2, op3, op4, op5;
13932 machine_mode half_mode = VOIDmode;
13933 machine_mode quarter_mode = VOIDmode;
13934 int n, i;
13935
13936 switch (mode)
13937 {
13938 case E_V2SFmode:
13939 case E_V2SImode:
13940 if (!mmx_ok && !TARGET_SSE)
13941 break;
13942 /* FALLTHRU */
13943
13944 case E_V16SImode:
13945 case E_V16SFmode:
13946 case E_V8DFmode:
13947 case E_V8DImode:
13948 case E_V8SFmode:
13949 case E_V8SImode:
13950 case E_V4DFmode:
13951 case E_V4DImode:
13952 case E_V4SFmode:
13953 case E_V4SImode:
13954 case E_V2DFmode:
13955 case E_V2DImode:
13956 n = GET_MODE_NUNITS (mode);
13957 for (i = 0; i < n; i++)
13958 ops[i] = XVECEXP (vals, 0, i);
13959 ix86_expand_vector_init_concat (mode, target, ops, n);
13960 return;
13961
13962 case E_V2TImode:
13963 for (i = 0; i < 2; i++)
13964 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13965 op0 = gen_reg_rtx (V4DImode);
13966 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13967 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13968 return;
13969
13970 case E_V4TImode:
13971 for (i = 0; i < 4; i++)
13972 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13973 ops[4] = gen_reg_rtx (V4DImode);
13974 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13975 ops[5] = gen_reg_rtx (V4DImode);
13976 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13977 op0 = gen_reg_rtx (V8DImode);
13978 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13979 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13980 return;
13981
13982 case E_V32QImode:
13983 half_mode = V16QImode;
13984 goto half;
13985
13986 case E_V16HImode:
13987 half_mode = V8HImode;
13988 goto half;
13989
13990 half:
13991 n = GET_MODE_NUNITS (mode);
13992 for (i = 0; i < n; i++)
13993 ops[i] = XVECEXP (vals, 0, i);
13994 op0 = gen_reg_rtx (half_mode);
13995 op1 = gen_reg_rtx (half_mode);
13996 ix86_expand_vector_init_interleave (half_mode, op0, ops,
13997 n >> 2);
13998 ix86_expand_vector_init_interleave (half_mode, op1,
13999 &ops [n >> 1], n >> 2);
14000 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14001 return;
14002
14003 case E_V64QImode:
14004 quarter_mode = V16QImode;
14005 half_mode = V32QImode;
14006 goto quarter;
14007
14008 case E_V32HImode:
14009 quarter_mode = V8HImode;
14010 half_mode = V16HImode;
14011 goto quarter;
14012
14013 quarter:
14014 n = GET_MODE_NUNITS (mode);
14015 for (i = 0; i < n; i++)
14016 ops[i] = XVECEXP (vals, 0, i);
14017 op0 = gen_reg_rtx (quarter_mode);
14018 op1 = gen_reg_rtx (quarter_mode);
14019 op2 = gen_reg_rtx (quarter_mode);
14020 op3 = gen_reg_rtx (quarter_mode);
14021 op4 = gen_reg_rtx (half_mode);
14022 op5 = gen_reg_rtx (half_mode);
14023 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14024 n >> 3);
14025 ix86_expand_vector_init_interleave (quarter_mode, op1,
14026 &ops [n >> 2], n >> 3);
14027 ix86_expand_vector_init_interleave (quarter_mode, op2,
14028 &ops [n >> 1], n >> 3);
14029 ix86_expand_vector_init_interleave (quarter_mode, op3,
14030 &ops [(n >> 1) | (n >> 2)], n >> 3);
14031 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14032 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14033 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14034 return;
14035
14036 case E_V16QImode:
14037 if (!TARGET_SSE4_1)
14038 break;
14039 /* FALLTHRU */
14040
14041 case E_V8HImode:
14042 if (!TARGET_SSE2)
14043 break;
14044
14045 /* Don't use ix86_expand_vector_init_interleave if we can't
14046 move from GPR to SSE register directly. */
14047 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14048 break;
14049
14050 n = GET_MODE_NUNITS (mode);
14051 for (i = 0; i < n; i++)
14052 ops[i] = XVECEXP (vals, 0, i);
14053 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14054 return;
14055
14056 case E_V4HImode:
14057 case E_V8QImode:
14058 break;
14059
14060 default:
14061 gcc_unreachable ();
14062 }
14063
14064 {
14065 int i, j, n_elts, n_words, n_elt_per_word;
14066 machine_mode inner_mode;
14067 rtx words[4], shift;
14068
14069 inner_mode = GET_MODE_INNER (mode);
14070 n_elts = GET_MODE_NUNITS (mode);
14071 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14072 n_elt_per_word = n_elts / n_words;
14073 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14074
14075 for (i = 0; i < n_words; ++i)
14076 {
14077 rtx word = NULL_RTX;
14078
14079 for (j = 0; j < n_elt_per_word; ++j)
14080 {
14081 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14082 elt = convert_modes (word_mode, inner_mode, elt, true);
14083
14084 if (j == 0)
14085 word = elt;
14086 else
14087 {
14088 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14089 word, 1, OPTAB_LIB_WIDEN);
14090 word = expand_simple_binop (word_mode, IOR, word, elt,
14091 word, 1, OPTAB_LIB_WIDEN);
14092 }
14093 }
14094
14095 words[i] = word;
14096 }
14097
14098 if (n_words == 1)
14099 emit_move_insn (target, gen_lowpart (mode, words[0]));
14100 else if (n_words == 2)
14101 {
14102 rtx tmp = gen_reg_rtx (mode);
14103 emit_clobber (tmp);
14104 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14105 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14106 emit_move_insn (target, tmp);
14107 }
14108 else if (n_words == 4)
14109 {
14110 rtx tmp = gen_reg_rtx (V4SImode);
14111 gcc_assert (word_mode == SImode);
14112 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14113 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14114 emit_move_insn (target, gen_lowpart (mode, tmp));
14115 }
14116 else
14117 gcc_unreachable ();
14118 }
14119 }
14120
14121 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14122 instructions unless MMX_OK is true. */
14123
14124 void
14125 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14126 {
14127 machine_mode mode = GET_MODE (target);
14128 machine_mode inner_mode = GET_MODE_INNER (mode);
14129 int n_elts = GET_MODE_NUNITS (mode);
14130 int n_var = 0, one_var = -1;
14131 bool all_same = true, all_const_zero = true;
14132 int i;
14133 rtx x;
14134
14135 /* Handle first initialization from vector elts. */
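/* In this case VALS holds two half-width vectors rather than scalars,
   e.g. a V8SImode target built from two V4SImode operands; vectors of
   QImode or HImode elements are recast as SImode-element vectors before
   the concatenation.  */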
14136 if (n_elts != XVECLEN (vals, 0))
14137 {
14138 rtx subtarget = target;
14139 x = XVECEXP (vals, 0, 0);
14140 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14141 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14142 {
14143 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14144 if (inner_mode == QImode || inner_mode == HImode)
14145 {
14146 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14147 mode = mode_for_vector (SImode, n_bits / 4).require ();
14148 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14149 ops[0] = gen_lowpart (inner_mode, ops[0]);
14150 ops[1] = gen_lowpart (inner_mode, ops[1]);
14151 subtarget = gen_reg_rtx (mode);
14152 }
14153 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14154 if (subtarget != target)
14155 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14156 return;
14157 }
14158 gcc_unreachable ();
14159 }
14160
14161 for (i = 0; i < n_elts; ++i)
14162 {
14163 x = XVECEXP (vals, 0, i);
14164 if (!(CONST_SCALAR_INT_P (x)
14165 || CONST_DOUBLE_P (x)
14166 || CONST_FIXED_P (x)))
14167 n_var++, one_var = i;
14168 else if (x != CONST0_RTX (inner_mode))
14169 all_const_zero = false;
14170 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14171 all_same = false;
14172 }
14173
14174 /* Constants are best loaded from the constant pool. */
14175 if (n_var == 0)
14176 {
14177 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14178 return;
14179 }
14180
14181 /* If all values are identical, broadcast the value. */
14182 if (all_same
14183 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14184 XVECEXP (vals, 0, 0)))
14185 return;
14186
14187 /* Values where only one field is non-constant are best loaded from
14188 the pool and overwritten via move later. */
14189 if (n_var == 1)
14190 {
14191 if (all_const_zero
14192 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14193 XVECEXP (vals, 0, one_var),
14194 one_var))
14195 return;
14196
14197 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14198 return;
14199 }
14200
14201 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14202 }
14203
14204 void
14205 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14206 {
14207 machine_mode mode = GET_MODE (target);
14208 machine_mode inner_mode = GET_MODE_INNER (mode);
14209 machine_mode half_mode;
14210 bool use_vec_merge = false;
14211 rtx tmp;
14212 static rtx (*gen_extract[6][2]) (rtx, rtx)
14213 = {
14214 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14215 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14216 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14217 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14218 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14219 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14220 };
14221 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14222 = {
14223 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14224 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14225 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14226 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14227 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14228 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14229 };
14230 int i, j, n;
14231 machine_mode mmode = VOIDmode;
14232 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14233
14234 switch (mode)
14235 {
14236 case E_V2SImode:
14237 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14238 if (use_vec_merge)
14239 break;
14240 /* FALLTHRU */
14241
14242 case E_V2SFmode:
14243 if (mmx_ok)
14244 {
14245 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14246 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14247 if (elt == 0)
14248 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14249 else
14250 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14251 emit_insn (gen_rtx_SET (target, tmp));
14252 return;
14253 }
14254 break;
14255
14256 case E_V2DImode:
14257 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14258 if (use_vec_merge)
14259 break;
14260
14261 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14262 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14263 if (elt == 0)
14264 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14265 else
14266 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14267 emit_insn (gen_rtx_SET (target, tmp));
14268 return;
14269
14270 case E_V2DFmode:
14271 /* NB: For ELT == 0, use standard scalar operation patterns which
14272 preserve the rest of the vector for the combiner:
14273
14274 (vec_merge:V2DF
14275 (vec_duplicate:V2DF (reg:DF))
14276 (reg:V2DF)
14277 (const_int 1))
14278 */
14279 if (elt == 0)
14280 goto do_vec_merge;
14281
14282 {
14283 rtx op0, op1;
14284
14285 /* For the two element vectors, we implement a VEC_CONCAT with
14286 the extraction of the other element. */
14287
14288 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14289 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14290
14291 if (elt == 0)
14292 op0 = val, op1 = tmp;
14293 else
14294 op0 = tmp, op1 = val;
14295
14296 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14297 emit_insn (gen_rtx_SET (target, tmp));
14298 }
14299 return;
14300
14301 case E_V4SFmode:
14302 use_vec_merge = TARGET_SSE4_1;
14303 if (use_vec_merge)
14304 break;
14305
14306 switch (elt)
14307 {
14308 case 0:
14309 use_vec_merge = true;
14310 break;
14311
14312 case 1:
14313 /* tmp = target = A B C D */
14314 tmp = copy_to_reg (target);
14315 /* target = A A B B */
14316 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14317 /* target = X A B B */
14318 ix86_expand_vector_set (false, target, val, 0);
14319 /* target = A X C D */
14320 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14321 const1_rtx, const0_rtx,
14322 GEN_INT (2+4), GEN_INT (3+4)));
14323 return;
14324
14325 case 2:
14326 /* tmp = target = A B C D */
14327 tmp = copy_to_reg (target);
14328 /* tmp = X B C D */
14329 ix86_expand_vector_set (false, tmp, val, 0);
14330 /* target = A B X D */
14331 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14332 const0_rtx, const1_rtx,
14333 GEN_INT (0+4), GEN_INT (3+4)));
14334 return;
14335
14336 case 3:
14337 /* tmp = target = A B C D */
14338 tmp = copy_to_reg (target);
14339 /* tmp = X B C D */
14340 ix86_expand_vector_set (false, tmp, val, 0);
14341 /* target = A B C X */
14342 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14343 const0_rtx, const1_rtx,
14344 GEN_INT (2+4), GEN_INT (0+4)));
14345 return;
14346
14347 default:
14348 gcc_unreachable ();
14349 }
14350 break;
14351
14352 case E_V4SImode:
14353 use_vec_merge = TARGET_SSE4_1;
14354 if (use_vec_merge)
14355 break;
14356
14357 /* Element 0 handled by vec_merge below. */
14358 if (elt == 0)
14359 {
14360 use_vec_merge = true;
14361 break;
14362 }
14363
14364 if (TARGET_SSE2)
14365 {
14366 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14367 store into element 0, then shuffle them back. */
14368
14369 rtx order[4];
14370
14371 order[0] = GEN_INT (elt);
14372 order[1] = const1_rtx;
14373 order[2] = const2_rtx;
14374 order[3] = GEN_INT (3);
14375 order[elt] = const0_rtx;
14376
14377 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14378 order[1], order[2], order[3]));
14379
14380 ix86_expand_vector_set (false, target, val, 0);
14381
14382 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14383 order[1], order[2], order[3]));
14384 }
14385 else
14386 {
14387 /* For SSE1, we have to reuse the V4SF code. */
14388 rtx t = gen_reg_rtx (V4SFmode);
14389 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14390 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14391 emit_move_insn (target, gen_lowpart (mode, t));
14392 }
14393 return;
14394
14395 case E_V8HImode:
14396 use_vec_merge = TARGET_SSE2;
14397 break;
14398 case E_V4HImode:
14399 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14400 break;
14401
14402 case E_V16QImode:
14403 use_vec_merge = TARGET_SSE4_1;
14404 break;
14405
14406 case E_V8QImode:
14407 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14408 break;
14409
14410 case E_V32QImode:
14411 half_mode = V16QImode;
14412 j = 0;
14413 n = 16;
14414 goto half;
14415
14416 case E_V16HImode:
14417 half_mode = V8HImode;
14418 j = 1;
14419 n = 8;
14420 goto half;
14421
14422 case E_V8SImode:
14423 half_mode = V4SImode;
14424 j = 2;
14425 n = 4;
14426 goto half;
14427
14428 case E_V4DImode:
14429 half_mode = V2DImode;
14430 j = 3;
14431 n = 2;
14432 goto half;
14433
14434 case E_V8SFmode:
14435 half_mode = V4SFmode;
14436 j = 4;
14437 n = 4;
14438 goto half;
14439
14440 case E_V4DFmode:
14441 half_mode = V2DFmode;
14442 j = 5;
14443 n = 2;
14444 goto half;
14445
14446 half:
14447 /* Compute offset. */
14448 i = elt / n;
14449 elt %= n;
14450
14451 gcc_assert (i <= 1);
14452
14453 /* Extract the half. */
14454 tmp = gen_reg_rtx (half_mode);
14455 emit_insn (gen_extract[j][i] (tmp, target));
14456
14457 /* Put val in tmp at elt. */
14458 ix86_expand_vector_set (false, tmp, val, elt);
14459
14460 /* Put it back. */
14461 emit_insn (gen_insert[j][i] (target, target, tmp));
14462 return;
14463
14464 case E_V8DFmode:
14465 if (TARGET_AVX512F)
14466 {
14467 mmode = QImode;
14468 gen_blendm = gen_avx512f_blendmv8df;
14469 }
14470 break;
14471
14472 case E_V8DImode:
14473 if (TARGET_AVX512F)
14474 {
14475 mmode = QImode;
14476 gen_blendm = gen_avx512f_blendmv8di;
14477 }
14478 break;
14479
14480 case E_V16SFmode:
14481 if (TARGET_AVX512F)
14482 {
14483 mmode = HImode;
14484 gen_blendm = gen_avx512f_blendmv16sf;
14485 }
14486 break;
14487
14488 case E_V16SImode:
14489 if (TARGET_AVX512F)
14490 {
14491 mmode = HImode;
14492 gen_blendm = gen_avx512f_blendmv16si;
14493 }
14494 break;
14495
14496 case E_V32HImode:
14497 if (TARGET_AVX512BW)
14498 {
14499 mmode = SImode;
14500 gen_blendm = gen_avx512bw_blendmv32hi;
14501 }
14502 else if (TARGET_AVX512F)
14503 {
14504 half_mode = E_V8HImode;
14505 n = 8;
14506 goto quarter;
14507 }
14508 break;
14509
14510 case E_V64QImode:
14511 if (TARGET_AVX512BW)
14512 {
14513 mmode = DImode;
14514 gen_blendm = gen_avx512bw_blendmv64qi;
14515 }
14516 else if (TARGET_AVX512F)
14517 {
14518 half_mode = E_V16QImode;
14519 n = 16;
14520 goto quarter;
14521 }
14522 break;
14523
14524 quarter:
14525 /* Compute offset. */
14526 i = elt / n;
14527 elt %= n;
14528
14529 gcc_assert (i <= 3);
14530
14531 {
14532 /* Extract the quarter. */
14533 tmp = gen_reg_rtx (V4SImode);
14534 rtx tmp2 = gen_lowpart (V16SImode, target);
14535 rtx mask = gen_reg_rtx (QImode);
14536
14537 emit_move_insn (mask, constm1_rtx);
14538 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14539 tmp, mask));
14540
14541 tmp2 = gen_reg_rtx (half_mode);
14542 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14543 tmp = tmp2;
14544
14545 /* Put val in tmp at elt. */
14546 ix86_expand_vector_set (false, tmp, val, elt);
14547
14548 /* Put it back. */
14549 tmp2 = gen_reg_rtx (V16SImode);
14550 rtx tmp3 = gen_lowpart (V16SImode, target);
14551 mask = gen_reg_rtx (HImode);
14552 emit_move_insn (mask, constm1_rtx);
14553 tmp = gen_lowpart (V4SImode, tmp);
14554 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14555 tmp3, mask));
14556 emit_move_insn (target, gen_lowpart (mode, tmp2));
14557 }
14558 return;
14559
14560 default:
14561 break;
14562 }
14563
14564 if (mmode != VOIDmode)
14565 {
14566 tmp = gen_reg_rtx (mode);
14567 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14568 /* The avx512*_blendm<mode> expanders have a different operand order
14569 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14570 elements where the mask is set and the second input operand otherwise,
14571 while in {sse,avx}*_*blend* the first input operand is used for elements
14572 where the mask is clear and the second input operand otherwise. */
14573 emit_insn (gen_blendm (target, target, tmp,
14574 force_reg (mmode,
14575 gen_int_mode (HOST_WIDE_INT_1U << elt,
14576 mmode))));
14577 }
14578 else if (use_vec_merge)
14579 {
14580 do_vec_merge:
14581 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14582 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14583 GEN_INT (HOST_WIDE_INT_1U << elt));
14584 emit_insn (gen_rtx_SET (target, tmp));
14585 }
14586 else
14587 {
14588 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14589
14590 emit_move_insn (mem, target);
14591
14592 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14593 emit_move_insn (tmp, val);
14594
14595 emit_move_insn (target, mem);
14596 }
14597 }
14598
14599 void
14600 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14601 {
14602 machine_mode mode = GET_MODE (vec);
14603 machine_mode inner_mode = GET_MODE_INNER (mode);
14604 bool use_vec_extr = false;
14605 rtx tmp;
14606
14607 switch (mode)
14608 {
14609 case E_V2SImode:
14610 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14611 if (use_vec_extr)
14612 break;
14613 /* FALLTHRU */
14614
14615 case E_V2SFmode:
14616 if (!mmx_ok)
14617 break;
14618 /* FALLTHRU */
14619
14620 case E_V2DFmode:
14621 case E_V2DImode:
14622 case E_V2TImode:
14623 case E_V4TImode:
14624 use_vec_extr = true;
14625 break;
14626
14627 case E_V4SFmode:
14628 use_vec_extr = TARGET_SSE4_1;
14629 if (use_vec_extr)
14630 break;
14631
14632 switch (elt)
14633 {
14634 case 0:
14635 tmp = vec;
14636 break;
14637
14638 case 1:
14639 case 3:
14640 tmp = gen_reg_rtx (mode);
14641 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14642 GEN_INT (elt), GEN_INT (elt),
14643 GEN_INT (elt+4), GEN_INT (elt+4)));
14644 break;
14645
14646 case 2:
14647 tmp = gen_reg_rtx (mode);
14648 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14649 break;
14650
14651 default:
14652 gcc_unreachable ();
14653 }
14654 vec = tmp;
14655 use_vec_extr = true;
14656 elt = 0;
14657 break;
14658
14659 case E_V4SImode:
14660 use_vec_extr = TARGET_SSE4_1;
14661 if (use_vec_extr)
14662 break;
14663
14664 if (TARGET_SSE2)
14665 {
14666 switch (elt)
14667 {
14668 case 0:
14669 tmp = vec;
14670 break;
14671
14672 case 1:
14673 case 3:
14674 tmp = gen_reg_rtx (mode);
14675 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14676 GEN_INT (elt), GEN_INT (elt),
14677 GEN_INT (elt), GEN_INT (elt)));
14678 break;
14679
14680 case 2:
14681 tmp = gen_reg_rtx (mode);
14682 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14683 break;
14684
14685 default:
14686 gcc_unreachable ();
14687 }
14688 vec = tmp;
14689 use_vec_extr = true;
14690 elt = 0;
14691 }
14692 else
14693 {
14694 /* For SSE1, we have to reuse the V4SF code. */
14695 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14696 gen_lowpart (V4SFmode, vec), elt);
14697 return;
14698 }
14699 break;
14700
14701 case E_V8HImode:
14702 use_vec_extr = TARGET_SSE2;
14703 break;
14704 case E_V4HImode:
14705 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14706 break;
14707
14708 case E_V16QImode:
14709 use_vec_extr = TARGET_SSE4_1;
14710 if (!use_vec_extr
14711 && TARGET_SSE2
14712 && elt == 0
14713 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14714 {
14715 tmp = gen_reg_rtx (SImode);
14716 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14717 0);
14718 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14719 return;
14720 }
14721 break;
14722
14723 case E_V8SFmode:
14724 if (TARGET_AVX)
14725 {
14726 tmp = gen_reg_rtx (V4SFmode);
14727 if (elt < 4)
14728 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14729 else
14730 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14731 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14732 return;
14733 }
14734 break;
14735
14736 case E_V4DFmode:
14737 if (TARGET_AVX)
14738 {
14739 tmp = gen_reg_rtx (V2DFmode);
14740 if (elt < 2)
14741 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14742 else
14743 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14744 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14745 return;
14746 }
14747 break;
14748
14749 case E_V32QImode:
14750 if (TARGET_AVX)
14751 {
14752 tmp = gen_reg_rtx (V16QImode);
14753 if (elt < 16)
14754 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14755 else
14756 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14757 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14758 return;
14759 }
14760 break;
14761
14762 case E_V16HImode:
14763 if (TARGET_AVX)
14764 {
14765 tmp = gen_reg_rtx (V8HImode);
14766 if (elt < 8)
14767 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14768 else
14769 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14770 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14771 return;
14772 }
14773 break;
14774
14775 case E_V8SImode:
14776 if (TARGET_AVX)
14777 {
14778 tmp = gen_reg_rtx (V4SImode);
14779 if (elt < 4)
14780 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14781 else
14782 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14783 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14784 return;
14785 }
14786 break;
14787
14788 case E_V4DImode:
14789 if (TARGET_AVX)
14790 {
14791 tmp = gen_reg_rtx (V2DImode);
14792 if (elt < 2)
14793 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14794 else
14795 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14796 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14797 return;
14798 }
14799 break;
14800
14801 case E_V32HImode:
14802 if (TARGET_AVX512BW)
14803 {
14804 tmp = gen_reg_rtx (V16HImode);
14805 if (elt < 16)
14806 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14807 else
14808 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14809 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14810 return;
14811 }
14812 break;
14813
14814 case E_V64QImode:
14815 if (TARGET_AVX512BW)
14816 {
14817 tmp = gen_reg_rtx (V32QImode);
14818 if (elt < 32)
14819 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14820 else
14821 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14822 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14823 return;
14824 }
14825 break;
14826
14827 case E_V16SFmode:
14828 tmp = gen_reg_rtx (V8SFmode);
14829 if (elt < 8)
14830 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14831 else
14832 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14833 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14834 return;
14835
14836 case E_V8DFmode:
14837 tmp = gen_reg_rtx (V4DFmode);
14838 if (elt < 4)
14839 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14840 else
14841 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14842 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14843 return;
14844
14845 case E_V16SImode:
14846 tmp = gen_reg_rtx (V8SImode);
14847 if (elt < 8)
14848 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14849 else
14850 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14851 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14852 return;
14853
14854 case E_V8DImode:
14855 tmp = gen_reg_rtx (V4DImode);
14856 if (elt < 4)
14857 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14858 else
14859 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14860 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14861 return;
14862
14863 case E_V8QImode:
14864 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14865 /* ??? Could extract the appropriate HImode element and shift. */
14866 break;
14867
14868 default:
14869 break;
14870 }
14871
14872 if (use_vec_extr)
14873 {
14874 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14875 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14876
14877 /* Let the rtl optimizers know about the zero extension performed. */
14878 if (inner_mode == QImode || inner_mode == HImode)
14879 {
14880 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14881 target = gen_lowpart (SImode, target);
14882 }
14883
14884 emit_insn (gen_rtx_SET (target, tmp));
14885 }
14886 else
14887 {
14888 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14889
14890 emit_move_insn (mem, vec);
14891
14892 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14893 emit_move_insn (target, tmp);
14894 }
14895 }
14896
14897 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14898 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14899 The upper bits of DEST are undefined, though they shouldn't cause
14900 exceptions (some bits from src or all zeros are ok). */
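/* For instance, with a V4SFmode SRC, I == 128 moves the upper two
   elements down via movhlps, while I == 64 moves element 1 into
   element 0.  */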
14901
14902 static void
14903 emit_reduc_half (rtx dest, rtx src, int i)
14904 {
14905 rtx tem, d = dest;
14906 switch (GET_MODE (src))
14907 {
14908 case E_V4SFmode:
14909 if (i == 128)
14910 tem = gen_sse_movhlps (dest, src, src);
14911 else
14912 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14913 GEN_INT (1 + 4), GEN_INT (1 + 4));
14914 break;
14915 case E_V2DFmode:
14916 tem = gen_vec_interleave_highv2df (dest, src, src);
14917 break;
14918 case E_V16QImode:
14919 case E_V8HImode:
14920 case E_V4SImode:
14921 case E_V2DImode:
14922 d = gen_reg_rtx (V1TImode);
14923 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14924 GEN_INT (i / 2));
14925 break;
14926 case E_V8SFmode:
14927 if (i == 256)
14928 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14929 else
14930 tem = gen_avx_shufps256 (dest, src, src,
14931 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14932 break;
14933 case E_V4DFmode:
14934 if (i == 256)
14935 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14936 else
14937 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14938 break;
14939 case E_V32QImode:
14940 case E_V16HImode:
14941 case E_V8SImode:
14942 case E_V4DImode:
14943 if (i == 256)
14944 {
14945 if (GET_MODE (dest) != V4DImode)
14946 d = gen_reg_rtx (V4DImode);
14947 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14948 gen_lowpart (V4DImode, src),
14949 const1_rtx);
14950 }
14951 else
14952 {
14953 d = gen_reg_rtx (V2TImode);
14954 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14955 GEN_INT (i / 2));
14956 }
14957 break;
14958 case E_V64QImode:
14959 case E_V32HImode:
14960 if (i < 64)
14961 {
14962 d = gen_reg_rtx (V4TImode);
14963 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
14964 GEN_INT (i / 2));
14965 break;
14966 }
14967 /* FALLTHRU */
14968 case E_V16SImode:
14969 case E_V16SFmode:
14970 case E_V8DImode:
14971 case E_V8DFmode:
14972 if (i > 128)
14973 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14974 gen_lowpart (V16SImode, src),
14975 gen_lowpart (V16SImode, src),
14976 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14977 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14978 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14979 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14980 GEN_INT (0xC), GEN_INT (0xD),
14981 GEN_INT (0xE), GEN_INT (0xF),
14982 GEN_INT (0x10), GEN_INT (0x11),
14983 GEN_INT (0x12), GEN_INT (0x13),
14984 GEN_INT (0x14), GEN_INT (0x15),
14985 GEN_INT (0x16), GEN_INT (0x17));
14986 else
14987 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14988 gen_lowpart (V16SImode, src),
14989 GEN_INT (i == 128 ? 0x2 : 0x1),
14990 GEN_INT (0x3),
14991 GEN_INT (0x3),
14992 GEN_INT (0x3),
14993 GEN_INT (i == 128 ? 0x6 : 0x5),
14994 GEN_INT (0x7),
14995 GEN_INT (0x7),
14996 GEN_INT (0x7),
14997 GEN_INT (i == 128 ? 0xA : 0x9),
14998 GEN_INT (0xB),
14999 GEN_INT (0xB),
15000 GEN_INT (0xB),
15001 GEN_INT (i == 128 ? 0xE : 0xD),
15002 GEN_INT (0xF),
15003 GEN_INT (0xF),
15004 GEN_INT (0xF));
15005 break;
15006 default:
15007 gcc_unreachable ();
15008 }
15009 emit_insn (tem);
15010 if (d != dest)
15011 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15012 }
15013
15014 /* Expand a vector reduction. FN is the binary pattern to reduce;
15015 DEST is the destination; IN is the input vector. */
15016
15017 void
15018 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15019 {
15020 rtx half, dst, vec = in;
15021 machine_mode mode = GET_MODE (in);
15022 int i;
15023
15024 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15025 if (TARGET_SSE4_1
15026 && mode == V8HImode
15027 && fn == gen_uminv8hi3)
15028 {
15029 emit_insn (gen_sse4_1_phminposuw (dest, in));
15030 return;
15031 }
15032
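/* Otherwise fold the vector in half repeatedly: emit_reduc_half copies
   the upper half onto the lower half and FN combines the two, so after
   the last step element 0 of DEST holds the full reduction while the
   remaining elements hold partial results.  */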
15033 for (i = GET_MODE_BITSIZE (mode);
15034 i > GET_MODE_UNIT_BITSIZE (mode);
15035 i >>= 1)
15036 {
15037 half = gen_reg_rtx (mode);
15038 emit_reduc_half (half, vec, i);
15039 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15040 dst = dest;
15041 else
15042 dst = gen_reg_rtx (mode);
15043 emit_insn (fn (dst, half, vec));
15044 vec = dst;
15045 }
15046 }
15047
15048 /* Output code to perform a conditional jump to LABEL if the C2 flag in
15049 the FP status register is set. */
15050
15051 void
15052 ix86_emit_fp_unordered_jump (rtx label)
15053 {
15054 rtx reg = gen_reg_rtx (HImode);
15055 rtx_insn *insn;
15056 rtx temp;
15057
15058 emit_insn (gen_x86_fnstsw_1 (reg));
15059
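/* The FPU status word is now in REG; C2 sits in bit 2 of its high byte.
   With SAHF the high byte is loaded straight into the flags, where C2
   lands in PF and is tested via the UNORDERED condition; otherwise the
   0x04 bit of the high byte is tested explicitly.  */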
15060 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15061 {
15062 emit_insn (gen_x86_sahf_1 (reg));
15063
15064 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15065 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15066 }
15067 else
15068 {
15069 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15070
15071 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15072 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15073 }
15074
15075 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15076 gen_rtx_LABEL_REF (VOIDmode, label),
15077 pc_rtx);
15078 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15079 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15080 JUMP_LABEL (insn) = label;
15081 }
15082
15083 /* Output code to perform a sinh XFmode calculation. */
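/* The expansion relies on
     sinh (x) = copysign (0.5 * (e + e / (e + 1.0)), x),  e = expm1 (|x|),
   which follows from sinh (|x|) = 0.5 * (exp (|x|) - exp (-|x|)) and
   avoids cancellation for small |x| by going through expm1.  */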
15084
15085 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15086 {
15087 rtx e1 = gen_reg_rtx (XFmode);
15088 rtx e2 = gen_reg_rtx (XFmode);
15089 rtx scratch = gen_reg_rtx (HImode);
15090 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15091 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15092 rtx cst1, tmp;
15093 rtx_code_label *jump_label = gen_label_rtx ();
15094 rtx_insn *insn;
15095
15096 /* scratch = fxam (op1) */
15097 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15098
15099 /* e1 = expm1 (|op1|) */
15100 emit_insn (gen_absxf2 (e2, op1));
15101 emit_insn (gen_expm1xf2 (e1, e2));
15102
15103 /* e2 = e1 / (e1 + 1.0) + e1 */
15104 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15105 emit_insn (gen_addxf3 (e2, e1, cst1));
15106 emit_insn (gen_divxf3 (e2, e1, e2));
15107 emit_insn (gen_addxf3 (e2, e2, e1));
15108
15109 /* flags = signbit (op1) */
15110 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15111
15112 /* if (flags) then e2 = -e2 */
15113 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15114 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15115 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15116 pc_rtx);
15117 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15118 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15119 JUMP_LABEL (insn) = jump_label;
15120
15121 emit_insn (gen_negxf2 (e2, e2));
15122
15123 emit_label (jump_label);
15124 LABEL_NUSES (jump_label) = 1;
15125
15126 /* op0 = 0.5 * e2 */
15127 half = force_reg (XFmode, half);
15128 emit_insn (gen_mulxf3 (op0, e2, half));
15129 }
15130
15131 /* Output code to perform a cosh XFmode calculation. */
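/* Uses cosh (x) = 0.5 * (exp (x) + 1.0 / exp (x)); no sign handling is
   needed because cosh is even.  */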
15132
15133 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15134 {
15135 rtx e1 = gen_reg_rtx (XFmode);
15136 rtx e2 = gen_reg_rtx (XFmode);
15137 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15138 rtx cst1;
15139
15140 /* e1 = exp (op1) */
15141 emit_insn (gen_expxf2 (e1, op1));
15142
15143 /* e2 = e1 + 1.0 / e1 */
15144 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15145 emit_insn (gen_divxf3 (e2, cst1, e1));
15146 emit_insn (gen_addxf3 (e2, e1, e2));
15147
15148 /* op0 = 0.5 * e2 */
15149 half = force_reg (XFmode, half);
15150 emit_insn (gen_mulxf3 (op0, e2, half));
15151 }
15152
15153 /* Output code to perform a tanh XFmode calculation. */
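/* The expansion below uses expm1 so tanh stays accurate near zero.  A
   rough scalar sketch (illustrative only; "x" stands for the value in
   OP1):

     e1 = expm1 (-fabs (x + x));
     tanh (x) = copysign (-e1 / (e1 + 2.0), x);  */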
15154
15155 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15156 {
15157 rtx e1 = gen_reg_rtx (XFmode);
15158 rtx e2 = gen_reg_rtx (XFmode);
15159 rtx scratch = gen_reg_rtx (HImode);
15160 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15161 rtx cst2, tmp;
15162 rtx_code_label *jump_label = gen_label_rtx ();
15163 rtx_insn *insn;
15164
15165 /* scratch = fxam (op1) */
15166 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15167
15168 /* e1 = expm1 (-|2 * op1|) */
15169 emit_insn (gen_addxf3 (e2, op1, op1));
15170 emit_insn (gen_absxf2 (e2, e2));
15171 emit_insn (gen_negxf2 (e2, e2));
15172 emit_insn (gen_expm1xf2 (e1, e2));
15173
15174 /* e2 = e1 / (e1 + 2.0) */
15175 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15176 emit_insn (gen_addxf3 (e2, e1, cst2));
15177 emit_insn (gen_divxf3 (e2, e1, e2));
15178
15179 /* flags = signbit (op1) */
15180 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15181
15182 /* if (!flags) then e2 = -e2 */
15183 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15184 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15185 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15186 pc_rtx);
15187 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15188 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15189 JUMP_LABEL (insn) = jump_label;
15190
15191 emit_insn (gen_negxf2 (e2, e2));
15192
15193 emit_label (jump_label);
15194 LABEL_NUSES (jump_label) = 1;
15195
15196 emit_move_insn (op0, e2);
15197 }
15198
15199 /* Output code to perform an asinh XFmode calculation. */
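/* A rough scalar sketch of the identity used below (illustrative only;
   "x" stands for the value in OP1).  Since t / (sqrt (t + 1) + 1)
   equals sqrt (t + 1) - 1, the log1p argument is |x| + sqrt (x*x + 1) - 1,
   and log1p keeps the result accurate for small arguments:

     t = x * x;
     u = t / (sqrt (t + 1.0) + 1.0) + fabs (x);
     asinh (x) = copysign (log1p (u), x);  */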
15200
15201 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15202 {
15203 rtx e1 = gen_reg_rtx (XFmode);
15204 rtx e2 = gen_reg_rtx (XFmode);
15205 rtx scratch = gen_reg_rtx (HImode);
15206 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15207 rtx cst1, tmp;
15208 rtx_code_label *jump_label = gen_label_rtx ();
15209 rtx_insn *insn;
15210
15211 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15212 emit_insn (gen_mulxf3 (e1, op1, op1));
15213 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15214 emit_insn (gen_addxf3 (e2, e1, cst1));
15215 emit_insn (gen_sqrtxf2 (e2, e2));
15216 emit_insn (gen_addxf3 (e2, e2, cst1));
15217
15218 /* e1 = e1 / e2 */
15219 emit_insn (gen_divxf3 (e1, e1, e2));
15220
15221 /* scratch = fxam (op1) */
15222 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15223
15224 /* e1 = e1 + |op1| */
15225 emit_insn (gen_absxf2 (e2, op1));
15226 emit_insn (gen_addxf3 (e1, e1, e2));
15227
15228 /* e2 = log1p (e1) */
15229 ix86_emit_i387_log1p (e2, e1);
15230
15231 /* flags = signbit (op1) */
15232 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15233
15234 /* if (flags) then e2 = -e2 */
15235 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15236 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15237 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15238 pc_rtx);
15239 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15240 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15241 JUMP_LABEL (insn) = jump_label;
15242
15243 emit_insn (gen_negxf2 (e2, e2));
15244
15245 emit_label (jump_label);
15246 LABEL_NUSES (jump_label) = 1;
15247
15248 emit_move_insn (op0, e2);
15249 }
15250
15251 /* Output code to perform an acosh XFmode calculation. */
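/* A rough scalar sketch of the identity used below (illustrative only;
   "x" stands for the value in OP1, assumed >= 1.0, where
   sqrt (x - 1) * sqrt (x + 1) equals sqrt (x*x - 1)):

     acosh (x) = log (x + sqrt (x - 1.0) * sqrt (x + 1.0));  */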
15252
15253 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15254 {
15255 rtx e1 = gen_reg_rtx (XFmode);
15256 rtx e2 = gen_reg_rtx (XFmode);
15257 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15258
15259 /* e2 = sqrt (op1 + 1.0) */
15260 emit_insn (gen_addxf3 (e2, op1, cst1));
15261 emit_insn (gen_sqrtxf2 (e2, e2));
15262
15263 /* e1 = sqrt (op1 - 1.0) */
15264 emit_insn (gen_subxf3 (e1, op1, cst1));
15265 emit_insn (gen_sqrtxf2 (e1, e1));
15266
15267 /* e1 = e1 * e2 */
15268 emit_insn (gen_mulxf3 (e1, e1, e2));
15269
15270 /* e1 = e1 + op1 */
15271 emit_insn (gen_addxf3 (e1, e1, op1));
15272
15273 /* op0 = log (e1) */
15274 emit_insn (gen_logxf2 (op0, e1));
15275 }
15276
15277 /* Output code to perform an atanh XFmode calculation. */
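/* A rough scalar sketch of the identity used below (illustrative only;
   "x" stands for the value in OP1).  The log1p argument equals
   (1 - |x|) / (1 + |x|) - 1, so the log1p call yields -2 * atanh (|x|):

     t = fabs (x);
     u = log1p (-(t + t) / (t + 1.0));
     atanh (x) = copysign (-0.5 * u, x);  */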
15278
15279 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15280 {
15281 rtx e1 = gen_reg_rtx (XFmode);
15282 rtx e2 = gen_reg_rtx (XFmode);
15283 rtx scratch = gen_reg_rtx (HImode);
15284 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15285 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15286 rtx cst1, tmp;
15287 rtx_code_label *jump_label = gen_label_rtx ();
15288 rtx_insn *insn;
15289
15290 /* scratch = fxam (op1) */
15291 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15292
15293 /* e2 = |op1| */
15294 emit_insn (gen_absxf2 (e2, op1));
15295
15296 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15297 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15298 emit_insn (gen_addxf3 (e1, e2, cst1));
15299 emit_insn (gen_addxf3 (e2, e2, e2));
15300 emit_insn (gen_negxf2 (e2, e2));
15301 emit_insn (gen_divxf3 (e1, e2, e1));
15302
15303 /* e2 = log1p (e1) */
15304 ix86_emit_i387_log1p (e2, e1);
15305
15306 /* flags = signbit (op1) */
15307 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15308
15309 /* if (!flags) then e2 = -e2 */
15310 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15311 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15312 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15313 pc_rtx);
15314 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15315 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15316 JUMP_LABEL (insn) = jump_label;
15317
15318 emit_insn (gen_negxf2 (e2, e2));
15319
15320 emit_label (jump_label);
15321 LABEL_NUSES (jump_label) = 1;
15322
15323 /* op0 = 0.5 * e2 */
15324 half = force_reg (XFmode, half);
15325 emit_insn (gen_mulxf3 (op0, e2, half));
15326 }
15327
15328 /* Output code to perform a log1p XFmode calculation. */
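/* The magic constant used below is 1 - sqrt(2)/2, roughly the largest
   |x| for which the i387 fyl2xp1 instruction is documented to compute
   log2 (1 + x) accurately.  A rough sketch of the dispatch (illustrative
   only; "x" stands for the value in OP1):

     if (fabs (x) < 0.29289321881345247...)
       use fyl2xp1:  result = ln(2) * log2 (1 + x), accurate near zero;
     else
       use fyl2x:    result = ln(2) * log2 (x + 1.0).  */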
15329
15330 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15331 {
15332 rtx_code_label *label1 = gen_label_rtx ();
15333 rtx_code_label *label2 = gen_label_rtx ();
15334
15335 rtx tmp = gen_reg_rtx (XFmode);
15336 rtx res = gen_reg_rtx (XFmode);
15337 rtx cst, cstln2, cst1;
15338 rtx_insn *insn;
15339
15340 cst = const_double_from_real_value
15341 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15342 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15343
15344 emit_insn (gen_absxf2 (tmp, op1));
15345
15346 cst = force_reg (XFmode, cst);
15347 ix86_expand_branch (GE, tmp, cst, label1);
15348 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15349 insn = get_last_insn ();
15350 JUMP_LABEL (insn) = label1;
15351
15352 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15353 emit_jump (label2);
15354
15355 emit_label (label1);
15356 LABEL_NUSES (label1) = 1;
15357
15358 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15359 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15360 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15361
15362 emit_label (label2);
15363 LABEL_NUSES (label2) = 1;
15364
15365 emit_move_insn (op0, res);
15366 }
15367
15368 /* Emit i387 code for a round calculation. */
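/* A rough scalar sketch of what is expanded (illustrative only; "a"
   stands for the value in OP1):

     r = floor (fabs (a) + 0.5);
     if (signbit (a))
       r = -r;

   i.e. halfway cases are rounded away from zero, unlike rint.  */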
15369 void ix86_emit_i387_round (rtx op0, rtx op1)
15370 {
15371 machine_mode inmode = GET_MODE (op1);
15372 machine_mode outmode = GET_MODE (op0);
15373 rtx e1 = gen_reg_rtx (XFmode);
15374 rtx e2 = gen_reg_rtx (XFmode);
15375 rtx scratch = gen_reg_rtx (HImode);
15376 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15377 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15378 rtx res = gen_reg_rtx (outmode);
15379 rtx_code_label *jump_label = gen_label_rtx ();
15380 rtx (*floor_insn) (rtx, rtx);
15381 rtx (*neg_insn) (rtx, rtx);
15382 rtx_insn *insn;
15383 rtx tmp;
15384
15385 switch (inmode)
15386 {
15387 case E_SFmode:
15388 case E_DFmode:
15389 tmp = gen_reg_rtx (XFmode);
15390
15391 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15392 op1 = tmp;
15393 break;
15394 case E_XFmode:
15395 break;
15396 default:
15397 gcc_unreachable ();
15398 }
15399
15400 switch (outmode)
15401 {
15402 case E_SFmode:
15403 floor_insn = gen_frndintxf2_floor;
15404 neg_insn = gen_negsf2;
15405 break;
15406 case E_DFmode:
15407 floor_insn = gen_frndintxf2_floor;
15408 neg_insn = gen_negdf2;
15409 break;
15410 case E_XFmode:
15411 floor_insn = gen_frndintxf2_floor;
15412 neg_insn = gen_negxf2;
15413 break;
15414 case E_HImode:
15415 floor_insn = gen_lfloorxfhi2;
15416 neg_insn = gen_neghi2;
15417 break;
15418 case E_SImode:
15419 floor_insn = gen_lfloorxfsi2;
15420 neg_insn = gen_negsi2;
15421 break;
15422 case E_DImode:
15423 floor_insn = gen_lfloorxfdi2;
15424 neg_insn = gen_negdi2;
15425 break;
15426 default:
15427 gcc_unreachable ();
15428 }
15429
15430 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15431
15432 /* scratch = fxam(op1) */
15433 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15434
15435 /* e1 = fabs(op1) */
15436 emit_insn (gen_absxf2 (e1, op1));
15437
15438 /* e2 = e1 + 0.5 */
15439 half = force_reg (XFmode, half);
15440 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15441
15442 /* res = floor(e2) */
15443 switch (outmode)
15444 {
15445 case E_SFmode:
15446 case E_DFmode:
15447 {
15448 tmp = gen_reg_rtx (XFmode);
15449
15450 emit_insn (floor_insn (tmp, e2));
15451 emit_insn (gen_rtx_SET (res,
15452 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15453 UNSPEC_TRUNC_NOOP)));
15454 }
15455 break;
15456 default:
15457 emit_insn (floor_insn (res, e2));
15458 }
15459
15460 /* flags = signbit(a) */
15461 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15462
15463 /* if (flags) then res = -res */
15464 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15465 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15466 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15467 pc_rtx);
15468 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15469 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15470 JUMP_LABEL (insn) = jump_label;
15471
15472 emit_insn (neg_insn (res, res));
15473
15474 emit_label (jump_label);
15475 LABEL_NUSES (jump_label) = 1;
15476
15477 emit_move_insn (op0, res);
15478 }
15479
15480 /* Output code to perform a Newton-Raphson approximation of a single precision
15481 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
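/* One Newton-Raphson step refines an estimate x0 of 1/b as
   x1 = x0 * (2 - b * x0), which the expansion below rewrites as
   (x0 + x0) - b * x0 * x0.  A rough scalar sketch (illustrative only;
   rcp () stands for the hardware reciprocal estimate):

     x0 = rcp (b);
     x1 = x0 + x0 - b * x0 * x0;
     a / b ~= a * x1;  */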
15482
15483 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15484 {
15485 rtx x0, x1, e0, e1;
15486
15487 x0 = gen_reg_rtx (mode);
15488 e0 = gen_reg_rtx (mode);
15489 e1 = gen_reg_rtx (mode);
15490 x1 = gen_reg_rtx (mode);
15491
15492 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15493
15494 b = force_reg (mode, b);
15495
15496 /* x0 = rcp(b) estimate */
15497 if (mode == V16SFmode || mode == V8DFmode)
15498 {
15499 if (TARGET_AVX512ER)
15500 {
15501 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15502 UNSPEC_RCP28)));
15503 /* res = a * x0 */
15504 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15505 return;
15506 }
15507 else
15508 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15509 UNSPEC_RCP14)));
15510 }
15511 else
15512 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15513 UNSPEC_RCP)));
15514
15515 /* e0 = x0 * b */
15516 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15517
15518 /* e0 = x0 * e0 */
15519 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15520
15521 /* e1 = x0 + x0 */
15522 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15523
15524 /* x1 = e1 - e0 */
15525 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15526
15527 /* res = a * x1 */
15528 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15529 }
15530
15531 /* Output code to perform a Newton-Raphson approximation of a
15532 single precision floating point [reciprocal] square root. */
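/* One Newton-Raphson step refines an estimate x0 of 1/sqrt(a) as
   x1 = 0.5 * x0 * (3 - a * x0 * x0).  The expansion below negates both
   factors so it can use an FMA, and multiplies by a * x0 instead of x0
   when the square root itself is wanted, since a * rsqrt (a) == sqrt (a).
   A rough scalar sketch (illustrative only; rsqrt () stands for the
   hardware reciprocal square root estimate):

     x0 = rsqrt (a);
     e0 = a * x0;
     r  = -0.5 * (recip ? x0 : e0) * (e0 * x0 - 3.0);  */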
15533
15534 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15535 {
15536 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15537 REAL_VALUE_TYPE r;
15538 int unspec;
15539
15540 x0 = gen_reg_rtx (mode);
15541 e0 = gen_reg_rtx (mode);
15542 e1 = gen_reg_rtx (mode);
15543 e2 = gen_reg_rtx (mode);
15544 e3 = gen_reg_rtx (mode);
15545
15546 if (TARGET_AVX512ER && mode == V16SFmode)
15547 {
15548 if (recip)
15549 /* res = rsqrt28(a) estimate */
15550 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15551 UNSPEC_RSQRT28)));
15552 else
15553 {
15554 /* x0 = rsqrt28(a) estimate */
15555 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15556 UNSPEC_RSQRT28)));
15557 /* res = rcp28(x0) estimate */
15558 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15559 UNSPEC_RCP28)));
15560 }
15561 return;
15562 }
15563
15564 real_from_integer (&r, VOIDmode, -3, SIGNED);
15565 mthree = const_double_from_real_value (r, SFmode);
15566
15567 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15568 mhalf = const_double_from_real_value (r, SFmode);
15569 unspec = UNSPEC_RSQRT;
15570
15571 if (VECTOR_MODE_P (mode))
15572 {
15573 mthree = ix86_build_const_vector (mode, true, mthree);
15574 mhalf = ix86_build_const_vector (mode, true, mhalf);
15575 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15576 if (GET_MODE_SIZE (mode) == 64)
15577 unspec = UNSPEC_RSQRT14;
15578 }
15579
15580 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15581 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15582
15583 a = force_reg (mode, a);
15584
15585 /* x0 = rsqrt(a) estimate */
15586 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15587 unspec)));
15588
15589 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN for sqrt (0.0). */
15590 if (!recip)
15591 {
15592 rtx zero = force_reg (mode, CONST0_RTX(mode));
15593 rtx mask;
15594
15595 /* Handle masked compare. */
15596 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15597 {
15598 mask = gen_reg_rtx (HImode);
15599 /* Imm value 0x4 corresponds to not-equal comparison. */
15600 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15601 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15602 }
15603 else
15604 {
15605 mask = gen_reg_rtx (mode);
15606 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15607 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15608 }
15609 }
15610
15611 mthree = force_reg (mode, mthree);
15612
15613 /* e0 = x0 * a */
15614 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15615
15616 unsigned vector_size = GET_MODE_SIZE (mode);
15617 if (TARGET_FMA
15618 || (TARGET_AVX512F && vector_size == 64)
15619 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
15620 emit_insn (gen_rtx_SET (e2,
15621 gen_rtx_FMA (mode, e0, x0, mthree)));
15622 else
15623 {
15624 /* e1 = e0 * x0 */
15625 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15626
15627 /* e2 = e1 - 3. */
15628 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15629 }
15630
15631 mhalf = force_reg (mode, mhalf);
15632 if (recip)
15633 /* e3 = -.5 * x0 */
15634 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15635 else
15636 /* e3 = -.5 * e0 */
15637 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15638 /* ret = e2 * e3 */
15639 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15640 }
15641
15642 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15643 mask for masking out the sign-bit is stored in *SMASK, if that is
15644 non-null. */
15645
15646 static rtx
15647 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15648 {
15649 machine_mode vmode, mode = GET_MODE (op0);
15650 rtx xa, mask;
15651
15652 xa = gen_reg_rtx (mode);
15653 if (mode == SFmode)
15654 vmode = V4SFmode;
15655 else if (mode == DFmode)
15656 vmode = V2DFmode;
15657 else
15658 vmode = mode;
15659 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15660 if (!VECTOR_MODE_P (mode))
15661 {
15662 /* We need to generate a scalar mode mask in this case. */
15663 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15664 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15665 mask = gen_reg_rtx (mode);
15666 emit_insn (gen_rtx_SET (mask, tmp));
15667 }
15668 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15669
15670 if (smask)
15671 *smask = mask;
15672
15673 return xa;
15674 }
15675
15676 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15677 swapping the operands if SWAP_OPERANDS is true. The expanded
15678 code is a forward jump to a newly created label in case the
15679 comparison is true. The generated label rtx is returned. */
15680 static rtx_code_label *
15681 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15682 bool swap_operands)
15683 {
15684 bool unordered_compare = ix86_unordered_fp_compare (code);
15685 rtx_code_label *label;
15686 rtx tmp, reg;
15687
15688 if (swap_operands)
15689 std::swap (op0, op1);
15690
15691 label = gen_label_rtx ();
15692 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15693 if (unordered_compare)
15694 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15695 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15696 emit_insn (gen_rtx_SET (reg, tmp));
15697 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15698 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15699 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15700 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15701 JUMP_LABEL (tmp) = label;
15702
15703 return label;
15704 }
15705
15706 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15707 using comparison code CODE. Operands are swapped for the comparison if
15708 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15709 static rtx
15710 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15711 bool swap_operands)
15712 {
15713 rtx (*insn)(rtx, rtx, rtx, rtx);
15714 machine_mode mode = GET_MODE (op0);
15715 rtx mask = gen_reg_rtx (mode);
15716
15717 if (swap_operands)
15718 std::swap (op0, op1);
15719
15720 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15721
15722 emit_insn (insn (mask, op0, op1,
15723 gen_rtx_fmt_ee (code, mode, op0, op1)));
15724 return mask;
15725 }
15726
15727 /* Expand copysign from SIGN to the positive value ABS_VALUE
15728 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15729 the sign-bit. */
15730
15731 static void
15732 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15733 {
15734 machine_mode mode = GET_MODE (sign);
15735 rtx sgn = gen_reg_rtx (mode);
15736 if (mask == NULL_RTX)
15737 {
15738 machine_mode vmode;
15739
15740 if (mode == SFmode)
15741 vmode = V4SFmode;
15742 else if (mode == DFmode)
15743 vmode = V2DFmode;
15744 else
15745 vmode = mode;
15746
15747 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15748 if (!VECTOR_MODE_P (mode))
15749 {
15750 /* We need to generate a scalar mode mask in this case. */
15751 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15752 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15753 mask = gen_reg_rtx (mode);
15754 emit_insn (gen_rtx_SET (mask, tmp));
15755 }
15756 }
15757 else
15758 mask = gen_rtx_NOT (mode, mask);
15759 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15760 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15761 }
15762
15763 /* Expand SSE sequence for computing lround from OP1 storing
15764 into OP0. */
15765
15766 void
15767 ix86_expand_lround (rtx op0, rtx op1)
15768 {
15769 /* C code for the stuff we're doing below:
15770 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15771 return (long)tmp;
15772 */
15773 machine_mode mode = GET_MODE (op1);
15774 const struct real_format *fmt;
15775 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15776 rtx adj;
15777
15778 /* load nextafter (0.5, 0.0) */
15779 fmt = REAL_MODE_FORMAT (mode);
15780 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15781 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15782
15783 /* adj = copysign (0.5, op1) */
15784 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15785 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15786
15787 /* adj = op1 + adj */
15788 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15789
15790 /* op0 = (imode)adj */
15791 expand_fix (op0, adj, 0);
15792 }
15793
15794 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
15795 into OPERAND0. */
15796
15797 void
15798 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15799 {
15800 /* C code for the stuff we're doing below (for do_floor):
15801 xi = (long)op1;
15802 xi -= (double)xi > op1 ? 1 : 0;
15803 return xi;
15804 */
15805 machine_mode fmode = GET_MODE (op1);
15806 machine_mode imode = GET_MODE (op0);
15807 rtx ireg, freg, tmp;
15808 rtx_code_label *label;
15809
15810 /* reg = (long)op1 */
15811 ireg = gen_reg_rtx (imode);
15812 expand_fix (ireg, op1, 0);
15813
15814 /* freg = (double)reg */
15815 freg = gen_reg_rtx (fmode);
15816 expand_float (freg, ireg, 0);
15817
15818 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15819 label = ix86_expand_sse_compare_and_jump (UNLE,
15820 freg, op1, !do_floor);
15821 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15822 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15823 emit_move_insn (ireg, tmp);
15824
15825 emit_label (label);
15826 LABEL_NUSES (label) = 1;
15827
15828 emit_move_insn (op0, ireg);
15829 }
15830
15831 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15832 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
15833
15834 static rtx
15835 ix86_gen_TWO52 (machine_mode mode)
15836 {
15837 REAL_VALUE_TYPE TWO52r;
15838 rtx TWO52;
15839
15840 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15841 TWO52 = const_double_from_real_value (TWO52r, mode);
15842 TWO52 = force_reg (mode, TWO52);
15843
15844 return TWO52;
15845 }
15846
15847 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
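/* The "+ TWO52 - TWO52" trick below relies on the fact that once a
   nonnegative double reaches 2**52 it has no fractional bits left, so
   the addition itself rounds the sum to an integer in the current
   rounding mode and the subtraction recovers that integer.  For example,
   in round-to-nearest DFmode: 3.7 + 2**52 == 2**52 + 4.0, and
   subtracting 2**52 leaves 4.0.  (For SFmode the constant is 2**23.)  */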
15848
15849 void
15850 ix86_expand_rint (rtx operand0, rtx operand1)
15851 {
15852 /* C code for the stuff we're doing below:
15853 xa = fabs (operand1);
15854 if (!isless (xa, 2**52))
15855 return operand1;
15856 two52 = 2**52;
15857 if (flag_rounding_math)
15858 {
15859 two52 = copysign (two52, operand1);
15860 xa = operand1;
15861 }
15862 xa = xa + two52 - two52;
15863 return copysign (xa, operand1);
15864 */
15865 machine_mode mode = GET_MODE (operand0);
15866 rtx res, xa, TWO52, two52, mask;
15867 rtx_code_label *label;
15868
15869 res = gen_reg_rtx (mode);
15870 emit_move_insn (res, operand1);
15871
15872 /* xa = abs (operand1) */
15873 xa = ix86_expand_sse_fabs (res, &mask);
15874
15875 /* if (!isless (xa, TWO52)) goto label; */
15876 TWO52 = ix86_gen_TWO52 (mode);
15877 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15878
15879 two52 = TWO52;
15880 if (flag_rounding_math)
15881 {
15882 two52 = gen_reg_rtx (mode);
15883 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
15884 xa = res;
15885 }
15886
15887 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
15888 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
15889
15890 ix86_sse_copysign_to_positive (res, xa, res, mask);
15891
15892 emit_label (label);
15893 LABEL_NUSES (label) = 1;
15894
15895 emit_move_insn (operand0, res);
15896 }
15897
15898 /* Expand SSE2 sequence for computing floor or ceil
15899 from OPERAND1 storing into OPERAND0. */
15900 void
15901 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15902 {
15903 /* C code for the stuff we expand below.
15904 double xa = fabs (x), x2;
15905 if (!isless (xa, TWO52))
15906 return x;
15907 x2 = (double)(long)x;
15908 Compensate. Floor:
15909 if (x2 > x)
15910 x2 -= 1;
15911 Compensate. Ceil:
15912 if (x2 < x)
15913 x2 += 1;
15914 if (HONOR_SIGNED_ZEROS (mode))
15915 return copysign (x2, x);
15916 return x2;
15917 */
15918 machine_mode mode = GET_MODE (operand0);
15919 rtx xa, xi, TWO52, tmp, one, res, mask;
15920 rtx_code_label *label;
15921
15922 TWO52 = ix86_gen_TWO52 (mode);
15923
15924 /* Temporary for holding the result, initialized to the input
15925 operand to ease control flow. */
15926 res = gen_reg_rtx (mode);
15927 emit_move_insn (res, operand1);
15928
15929 /* xa = abs (operand1) */
15930 xa = ix86_expand_sse_fabs (res, &mask);
15931
15932 /* if (!isless (xa, TWO52)) goto label; */
15933 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15934
15935 /* xa = (double)(long)x */
15936 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15937 expand_fix (xi, res, 0);
15938 expand_float (xa, xi, 0);
15939
15940 /* generate 1.0 */
15941 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15942
15943 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15944 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15945 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15946 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15947 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15948 emit_move_insn (res, tmp);
15949
15950 if (HONOR_SIGNED_ZEROS (mode))
15951 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15952
15953 emit_label (label);
15954 LABEL_NUSES (label) = 1;
15955
15956 emit_move_insn (operand0, res);
15957 }
15958
15959 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15960 into OPERAND0 without relying on DImode truncation via cvttsd2siq
15961 that is only available on 64bit targets. */
15962 void
15963 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15964 {
15965 /* C code for the stuff we expand below.
15966 double xa = fabs (x), x2;
15967 if (!isless (xa, TWO52))
15968 return x;
15969 xa = xa + TWO52 - TWO52;
15970 x2 = copysign (xa, x);
15971 Compensate. Floor:
15972 if (x2 > x)
15973 x2 -= 1;
15974 Compensate. Ceil:
15975 if (x2 < x)
15976 x2 += 1;
15977 if (HONOR_SIGNED_ZEROS (mode))
15978 x2 = copysign (x2, x);
15979 return x2;
15980 */
15981 machine_mode mode = GET_MODE (operand0);
15982 rtx xa, TWO52, tmp, one, res, mask;
15983 rtx_code_label *label;
15984
15985 TWO52 = ix86_gen_TWO52 (mode);
15986
15987 /* Temporary for holding the result, initialized to the input
15988 operand to ease control flow. */
15989 res = gen_reg_rtx (mode);
15990 emit_move_insn (res, operand1);
15991
15992 /* xa = abs (operand1) */
15993 xa = ix86_expand_sse_fabs (res, &mask);
15994
15995 /* if (!isless (xa, TWO52)) goto label; */
15996 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15997
15998 /* xa = xa + TWO52 - TWO52; */
15999 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16000 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16001
16002 /* xa = copysign (xa, operand1) */
16003 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16004
16005 /* generate 1.0 */
16006 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16007
16008 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16009 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16010 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16011 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16012 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16013 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
16014 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16015 emit_move_insn (res, tmp);
16016
16017 emit_label (label);
16018 LABEL_NUSES (label) = 1;
16019
16020 emit_move_insn (operand0, res);
16021 }
16022
16023 /* Expand SSE sequence for computing trunc
16024 from OPERAND1 storing into OPERAND0. */
16025 void
16026 ix86_expand_trunc (rtx operand0, rtx operand1)
16027 {
16028 /* C code for the SSE variant we expand below.
16029 double xa = fabs (x), x2;
16030 if (!isless (xa, TWO52))
16031 return x;
16032 x2 = (double)(long)x;
16033 if (HONOR_SIGNED_ZEROS (mode))
16034 return copysign (x2, x);
16035 return x2;
16036 */
16037 machine_mode mode = GET_MODE (operand0);
16038 rtx xa, xi, TWO52, res, mask;
16039 rtx_code_label *label;
16040
16041 TWO52 = ix86_gen_TWO52 (mode);
16042
16043 /* Temporary for holding the result, initialized to the input
16044 operand to ease control flow. */
16045 res = gen_reg_rtx (mode);
16046 emit_move_insn (res, operand1);
16047
16048 /* xa = abs (operand1) */
16049 xa = ix86_expand_sse_fabs (res, &mask);
16050
16051 /* if (!isless (xa, TWO52)) goto label; */
16052 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16053
16054 /* x = (double)(long)x */
16055 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16056 expand_fix (xi, res, 0);
16057 expand_float (res, xi, 0);
16058
16059 if (HONOR_SIGNED_ZEROS (mode))
16060 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16061
16062 emit_label (label);
16063 LABEL_NUSES (label) = 1;
16064
16065 emit_move_insn (operand0, res);
16066 }
16067
16068 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16069 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16070 that is only available on 64bit targets. */
16071 void
16072 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16073 {
16074 machine_mode mode = GET_MODE (operand0);
16075 rtx xa, mask, TWO52, one, res, smask, tmp;
16076 rtx_code_label *label;
16077
16078 /* C code for the SSE variant we expand below.
16079 double xa = fabs (x), x2;
16080 if (!isless (xa, TWO52))
16081 return x;
16082 xa2 = xa + TWO52 - TWO52;
16083 Compensate:
16084 if (xa2 > xa)
16085 xa2 -= 1.0;
16086 x2 = copysign (xa2, x);
16087 return x2;
16088 */
16089
16090 TWO52 = ix86_gen_TWO52 (mode);
16091
16092 /* Temporary for holding the result, initialized to the input
16093 operand to ease control flow. */
16094 res = gen_reg_rtx (mode);
16095 emit_move_insn (res, operand1);
16096
16097 /* xa = abs (operand1) */
16098 xa = ix86_expand_sse_fabs (res, &smask);
16099
16100 /* if (!isless (xa, TWO52)) goto label; */
16101 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16102
16103 /* res = xa + TWO52 - TWO52; */
16104 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16105 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
16106 emit_move_insn (res, tmp);
16107
16108 /* generate 1.0 */
16109 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16110
16111 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
16112 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
16113 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
16114 tmp = expand_simple_binop (mode, MINUS,
16115 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
16116 emit_move_insn (res, tmp);
16117
16118 /* res = copysign (res, operand1) */
16119 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
16120
16121 emit_label (label);
16122 LABEL_NUSES (label) = 1;
16123
16124 emit_move_insn (operand0, res);
16125 }
16126
16127 /* Expand SSE sequence for computing round
16128 from OPERAND1 storing into OPERAND0. */
16129 void
16130 ix86_expand_round (rtx operand0, rtx operand1)
16131 {
16132 /* C code for the stuff we're doing below:
16133 double xa = fabs (x);
16134 if (!isless (xa, TWO52))
16135 return x;
16136 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16137 return copysign (xa, x);
16138 */
16139 machine_mode mode = GET_MODE (operand0);
16140 rtx res, TWO52, xa, xi, half, mask;
16141 rtx_code_label *label;
16142 const struct real_format *fmt;
16143 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16144
16145 /* Temporary for holding the result, initialized to the input
16146 operand to ease control flow. */
16147 res = gen_reg_rtx (mode);
16148 emit_move_insn (res, operand1);
16149
16150 TWO52 = ix86_gen_TWO52 (mode);
16151 xa = ix86_expand_sse_fabs (res, &mask);
16152 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16153
16154 /* load nextafter (0.5, 0.0) */
16155 fmt = REAL_MODE_FORMAT (mode);
16156 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16157 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16158
16159 /* xa = xa + 0.5 */
16160 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16161 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16162
16163 /* xa = (double)(int64_t)xa */
16164 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16165 expand_fix (xi, xa, 0);
16166 expand_float (xa, xi, 0);
16167
16168 /* res = copysign (xa, operand1) */
16169 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16170
16171 emit_label (label);
16172 LABEL_NUSES (label) = 1;
16173
16174 emit_move_insn (operand0, res);
16175 }
16176
16177 /* Expand SSE sequence for computing round from OPERAND1 storing
16178 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16179 that is only available on 64bit targets. */
16180 void
16181 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16182 {
16183 /* C code for the stuff we expand below.
16184 double xa = fabs (x), xa2, x2;
16185 if (!isless (xa, TWO52))
16186 return x;
16187 Using the absolute value and copying back sign makes
16188 -0.0 -> -0.0 correct.
16189 xa2 = xa + TWO52 - TWO52;
16190 Compensate.
16191 dxa = xa2 - xa;
16192 if (dxa <= -0.5)
16193 xa2 += 1;
16194 else if (dxa > 0.5)
16195 xa2 -= 1;
16196 x2 = copysign (xa2, x);
16197 return x2;
16198 */
16199 machine_mode mode = GET_MODE (operand0);
16200 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16201 rtx_code_label *label;
16202
16203 TWO52 = ix86_gen_TWO52 (mode);
16204
16205 /* Temporary for holding the result, initialized to the input
16206 operand to ease control flow. */
16207 res = gen_reg_rtx (mode);
16208 emit_move_insn (res, operand1);
16209
16210 /* xa = abs (operand1) */
16211 xa = ix86_expand_sse_fabs (res, &mask);
16212
16213 /* if (!isless (xa, TWO52)) goto label; */
16214 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16215
16216 /* xa2 = xa + TWO52 - TWO52; */
16217 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16218 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16219
16220 /* dxa = xa2 - xa; */
16221 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16222
16223 /* generate 0.5, 1.0 and -0.5 */
16224 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16225 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16226 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16227 0, OPTAB_DIRECT);
16228
16229 /* Compensate. */
16230 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16231 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16232 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16233 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16234 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16235 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16236 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16237 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16238
16239 /* res = copysign (xa2, operand1) */
16240 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16241
16242 emit_label (label);
16243 LABEL_NUSES (label) = 1;
16244
16245 emit_move_insn (operand0, res);
16246 }
16247
16248 /* Expand SSE sequence for computing round
16249    from OP1 storing into OP0 using the SSE4.1 round insn. */
16250 void
16251 ix86_expand_round_sse4 (rtx op0, rtx op1)
16252 {
16253 machine_mode mode = GET_MODE (op0);
16254 rtx e1, e2, res, half;
16255 const struct real_format *fmt;
16256 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16257 rtx (*gen_copysign) (rtx, rtx, rtx);
16258 rtx (*gen_round) (rtx, rtx, rtx);
16259
16260 switch (mode)
16261 {
16262 case E_SFmode:
16263 gen_copysign = gen_copysignsf3;
16264 gen_round = gen_sse4_1_roundsf2;
16265 break;
16266 case E_DFmode:
16267 gen_copysign = gen_copysigndf3;
16268 gen_round = gen_sse4_1_rounddf2;
16269 break;
16270 default:
16271 gcc_unreachable ();
16272 }
16273
16274 /* round (a) = trunc (a + copysign (0.5, a)) */
16275
16276 /* load nextafter (0.5, 0.0) */
16277 fmt = REAL_MODE_FORMAT (mode);
16278 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16279 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16280 half = const_double_from_real_value (pred_half, mode);
16281
16282 /* e1 = copysign (0.5, op1) */
16283 e1 = gen_reg_rtx (mode);
16284 emit_insn (gen_copysign (e1, half, op1));
16285
16286 /* e2 = op1 + e1 */
16287 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16288
16289 /* res = trunc (e2) */
16290 res = gen_reg_rtx (mode);
16291 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16292
16293 emit_move_insn (op0, res);
16294 }
16295
16296 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16297 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16298 insn every time. */
16299
16300 static GTY(()) rtx_insn *vselect_insn;
16301
16302 /* Initialize vselect_insn. */
16303
16304 static void
16305 init_vselect_insn (void)
16306 {
16307 unsigned i;
16308 rtx x;
16309
16310 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16311 for (i = 0; i < MAX_VECT_LEN; ++i)
16312 XVECEXP (x, 0, i) = const0_rtx;
16313 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16314 const0_rtx), x);
16315 x = gen_rtx_SET (const0_rtx, x);
16316 start_sequence ();
16317 vselect_insn = emit_insn (x);
16318 end_sequence ();
16319 }
16320
16321 /* Construct (set target (vec_select op0 (parallel perm))) and
16322 return true if that's a valid instruction in the active ISA. */
16323
16324 static bool
16325 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16326 unsigned nelt, bool testing_p)
16327 {
16328 unsigned int i;
16329 rtx x, save_vconcat;
16330 int icode;
16331
16332 if (vselect_insn == NULL_RTX)
16333 init_vselect_insn ();
16334
16335 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16336 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16337 for (i = 0; i < nelt; ++i)
16338 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16339 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16340 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16341 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16342 SET_DEST (PATTERN (vselect_insn)) = target;
16343 icode = recog_memoized (vselect_insn);
16344
16345 if (icode >= 0 && !testing_p)
16346 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16347
16348 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16349 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16350 INSN_CODE (vselect_insn) = -1;
16351
16352 return icode >= 0;
16353 }
16354
16355 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16356
16357 static bool
16358 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16359 const unsigned char *perm, unsigned nelt,
16360 bool testing_p)
16361 {
16362 machine_mode v2mode;
16363 rtx x;
16364 bool ok;
16365
16366 if (vselect_insn == NULL_RTX)
16367 init_vselect_insn ();
16368
16369 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16370 return false;
16371 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16372 PUT_MODE (x, v2mode);
16373 XEXP (x, 0) = op0;
16374 XEXP (x, 1) = op1;
16375 ok = expand_vselect (target, x, perm, nelt, testing_p);
16376 XEXP (x, 0) = const0_rtx;
16377 XEXP (x, 1) = const0_rtx;
16378 return ok;
16379 }
16380
16381 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16382 using movss or movsd. */
16383 static bool
16384 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16385 {
16386 machine_mode vmode = d->vmode;
16387 unsigned i, nelt = d->nelt;
16388 rtx x;
16389
16390 if (d->one_operand_p)
16391 return false;
16392
16393 if (!(TARGET_SSE && vmode == V4SFmode)
16394 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16395 && !(TARGET_SSE2 && vmode == V2DFmode))
16396 return false;
16397
16398 /* Only the first element is changed. */
16399 if (d->perm[0] != nelt && d->perm[0] != 0)
16400 return false;
16401 for (i = 1; i < nelt; ++i)
16402 if (d->perm[i] != i + nelt - d->perm[0])
16403 return false;
16404
16405 if (d->testing_p)
16406 return true;
16407
16408 if (d->perm[0] == nelt)
16409 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16410 else
16411 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16412
16413 emit_insn (gen_rtx_SET (d->target, x));
16414
16415 return true;
16416 }
16417
16418 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16419 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
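/* For example, the V8HImode permutation { 0, 9, 2, 11, 4, 13, 6, 15 }
   takes the odd elements from op1 and the even ones from op0, so it can
   be emitted as pblendw with the immediate mask 0xaa (an illustrative
   example only).  */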
16420
16421 static bool
16422 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16423 {
16424 machine_mode mmode, vmode = d->vmode;
16425 unsigned i, nelt = d->nelt;
16426 unsigned HOST_WIDE_INT mask;
16427 rtx target, op0, op1, maskop, x;
16428 rtx rperm[32], vperm;
16429
16430 if (d->one_operand_p)
16431 return false;
16432 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16433 && (TARGET_AVX512BW
16434 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16435 ;
16436 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16437 ;
16438 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16439 ;
16440 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16441 ;
16442 else
16443 return false;
16444
16445 /* This is a blend, not a permute. Elements must stay in their
16446 respective lanes. */
16447 for (i = 0; i < nelt; ++i)
16448 {
16449 unsigned e = d->perm[i];
16450 if (!(e == i || e == i + nelt))
16451 return false;
16452 }
16453
16454 if (d->testing_p)
16455 return true;
16456
16457 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16458 decision should be extracted elsewhere, so that we only try that
16459 sequence once all budget==3 options have been tried. */
16460 target = d->target;
16461 op0 = d->op0;
16462 op1 = d->op1;
16463 mask = 0;
16464
16465 switch (vmode)
16466 {
16467 case E_V8DFmode:
16468 case E_V16SFmode:
16469 case E_V4DFmode:
16470 case E_V8SFmode:
16471 case E_V2DFmode:
16472 case E_V4SFmode:
16473 case E_V8HImode:
16474 case E_V8SImode:
16475 case E_V32HImode:
16476 case E_V64QImode:
16477 case E_V16SImode:
16478 case E_V8DImode:
16479 for (i = 0; i < nelt; ++i)
16480 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16481 break;
16482
16483 case E_V2DImode:
16484 for (i = 0; i < 2; ++i)
16485 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16486 vmode = V8HImode;
16487 goto do_subreg;
16488
16489 case E_V4SImode:
16490 for (i = 0; i < 4; ++i)
16491 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16492 vmode = V8HImode;
16493 goto do_subreg;
16494
16495 case E_V16QImode:
16496 /* See if bytes move in pairs so we can use pblendw with
16497 an immediate argument, rather than pblendvb with a vector
16498 argument. */
16499 for (i = 0; i < 16; i += 2)
16500 if (d->perm[i] + 1 != d->perm[i + 1])
16501 {
16502 use_pblendvb:
16503 for (i = 0; i < nelt; ++i)
16504 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16505
16506 finish_pblendvb:
16507 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16508 vperm = force_reg (vmode, vperm);
16509
16510 if (GET_MODE_SIZE (vmode) == 16)
16511 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16512 else
16513 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16514 if (target != d->target)
16515 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16516 return true;
16517 }
16518
16519 for (i = 0; i < 8; ++i)
16520 mask |= (d->perm[i * 2] >= 16) << i;
16521 vmode = V8HImode;
16522 /* FALLTHRU */
16523
16524 do_subreg:
16525 target = gen_reg_rtx (vmode);
16526 op0 = gen_lowpart (vmode, op0);
16527 op1 = gen_lowpart (vmode, op1);
16528 break;
16529
16530 case E_V32QImode:
16531 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16532 for (i = 0; i < 32; i += 2)
16533 if (d->perm[i] + 1 != d->perm[i + 1])
16534 goto use_pblendvb;
16535 /* See if bytes move in quadruplets. If yes, vpblendd
16536 with immediate can be used. */
16537 for (i = 0; i < 32; i += 4)
16538 if (d->perm[i] + 2 != d->perm[i + 2])
16539 break;
16540 if (i < 32)
16541 {
16542 /* See if bytes move the same in both lanes. If yes,
16543 vpblendw with immediate can be used. */
16544 for (i = 0; i < 16; i += 2)
16545 if (d->perm[i] + 16 != d->perm[i + 16])
16546 goto use_pblendvb;
16547
16548 /* Use vpblendw. */
16549 for (i = 0; i < 16; ++i)
16550 mask |= (d->perm[i * 2] >= 32) << i;
16551 vmode = V16HImode;
16552 goto do_subreg;
16553 }
16554
16555 /* Use vpblendd. */
16556 for (i = 0; i < 8; ++i)
16557 mask |= (d->perm[i * 4] >= 32) << i;
16558 vmode = V8SImode;
16559 goto do_subreg;
16560
16561 case E_V16HImode:
16562 /* See if words move in pairs. If yes, vpblendd can be used. */
16563 for (i = 0; i < 16; i += 2)
16564 if (d->perm[i] + 1 != d->perm[i + 1])
16565 break;
16566 if (i < 16)
16567 {
16568 /* See if words move the same in both lanes. If not,
16569 vpblendvb must be used. */
16570 for (i = 0; i < 8; i++)
16571 if (d->perm[i] + 8 != d->perm[i + 8])
16572 {
16573 /* Use vpblendvb. */
16574 for (i = 0; i < 32; ++i)
16575 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16576
16577 vmode = V32QImode;
16578 nelt = 32;
16579 target = gen_reg_rtx (vmode);
16580 op0 = gen_lowpart (vmode, op0);
16581 op1 = gen_lowpart (vmode, op1);
16582 goto finish_pblendvb;
16583 }
16584
16585 /* Use vpblendw. */
16586 for (i = 0; i < 16; ++i)
16587 mask |= (d->perm[i] >= 16) << i;
16588 break;
16589 }
16590
16591 /* Use vpblendd. */
16592 for (i = 0; i < 8; ++i)
16593 mask |= (d->perm[i * 2] >= 16) << i;
16594 vmode = V8SImode;
16595 goto do_subreg;
16596
16597 case E_V4DImode:
16598 /* Use vpblendd. */
16599 for (i = 0; i < 4; ++i)
16600 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16601 vmode = V8SImode;
16602 goto do_subreg;
16603
16604 default:
16605 gcc_unreachable ();
16606 }
16607
16608 switch (vmode)
16609 {
16610 case E_V8DFmode:
16611 case E_V8DImode:
16612 mmode = QImode;
16613 break;
16614 case E_V16SFmode:
16615 case E_V16SImode:
16616 mmode = HImode;
16617 break;
16618 case E_V32HImode:
16619 mmode = SImode;
16620 break;
16621 case E_V64QImode:
16622 mmode = DImode;
16623 break;
16624 default:
16625 mmode = VOIDmode;
16626 }
16627
16628 if (mmode != VOIDmode)
16629 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16630 else
16631 maskop = GEN_INT (mask);
16632
16633 /* This matches five different patterns with the different modes. */
16634 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16635 x = gen_rtx_SET (target, x);
16636 emit_insn (x);
16637 if (target != d->target)
16638 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16639
16640 return true;
16641 }
16642
16643 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16644 in terms of the variable form of vpermilps.
16645
16646 Note that we will have already failed the immediate input vpermilps,
16647 which requires that the high and low part shuffle be identical; the
16648 variable form doesn't require that. */
16649
16650 static bool
16651 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16652 {
16653 rtx rperm[8], vperm;
16654 unsigned i;
16655
16656 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16657 return false;
16658
16659 /* We can only permute within the 128-bit lane. */
16660 for (i = 0; i < 8; ++i)
16661 {
16662 unsigned e = d->perm[i];
16663 if (i < 4 ? e >= 4 : e < 4)
16664 return false;
16665 }
16666
16667 if (d->testing_p)
16668 return true;
16669
16670 for (i = 0; i < 8; ++i)
16671 {
16672 unsigned e = d->perm[i];
16673
16674 /* Within each 128-bit lane, the elements of op0 are numbered
16675 from 0 and the elements of op1 are numbered from 4. */
16676 if (e >= 8 + 4)
16677 e -= 8;
16678 else if (e >= 4)
16679 e -= 4;
16680
16681 rperm[i] = GEN_INT (e);
16682 }
16683
16684 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16685 vperm = force_reg (V8SImode, vperm);
16686 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16687
16688 return true;
16689 }
16690
16691 /* Return true if permutation D can be performed as VMODE permutation
16692 instead. */
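/* For example, the V16QImode permutation { 2, 3, 0, 1, 6, 7, 4, 5, ... }
   moves bytes in aligned pairs, so the same shuffle is also expressible
   as the V8HImode permutation { 1, 0, 3, 2, ... } (an illustrative
   example only).  */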
16693
16694 static bool
16695 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16696 {
16697 unsigned int i, j, chunk;
16698
16699 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16700 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16701 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16702 return false;
16703
16704 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16705 return true;
16706
16707 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16708 for (i = 0; i < d->nelt; i += chunk)
16709 if (d->perm[i] & (chunk - 1))
16710 return false;
16711 else
16712 for (j = 1; j < chunk; ++j)
16713 if (d->perm[i] + j != d->perm[i + j])
16714 return false;
16715
16716 return true;
16717 }
16718
16719 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16720 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16721
16722 static bool
16723 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16724 {
16725 unsigned i, nelt, eltsz, mask;
16726 unsigned char perm[64];
16727 machine_mode vmode = V16QImode;
16728 rtx rperm[64], vperm, target, op0, op1;
16729
16730 nelt = d->nelt;
16731
16732 if (!d->one_operand_p)
16733 {
16734 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16735 {
16736 if (TARGET_AVX2
16737 && valid_perm_using_mode_p (V2TImode, d))
16738 {
16739 if (d->testing_p)
16740 return true;
16741
16742 /* Use vperm2i128 insn. The pattern uses
16743 V4DImode instead of V2TImode. */
16744 target = d->target;
16745 if (d->vmode != V4DImode)
16746 target = gen_reg_rtx (V4DImode);
16747 op0 = gen_lowpart (V4DImode, d->op0);
16748 op1 = gen_lowpart (V4DImode, d->op1);
16749 rperm[0]
16750 = GEN_INT ((d->perm[0] / (nelt / 2))
16751 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16752 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16753 if (target != d->target)
16754 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16755 return true;
16756 }
16757 return false;
16758 }
16759 }
16760 else
16761 {
16762 if (GET_MODE_SIZE (d->vmode) == 16)
16763 {
16764 if (!TARGET_SSSE3)
16765 return false;
16766 }
16767 else if (GET_MODE_SIZE (d->vmode) == 32)
16768 {
16769 if (!TARGET_AVX2)
16770 return false;
16771
16772 /* V4DImode should be already handled through
16773 expand_vselect by vpermq instruction. */
16774 gcc_assert (d->vmode != V4DImode);
16775
16776 vmode = V32QImode;
16777 if (d->vmode == V8SImode
16778 || d->vmode == V16HImode
16779 || d->vmode == V32QImode)
16780 {
16781 /* First see if vpermq can be used for
16782 V8SImode/V16HImode/V32QImode. */
16783 if (valid_perm_using_mode_p (V4DImode, d))
16784 {
16785 for (i = 0; i < 4; i++)
16786 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16787 if (d->testing_p)
16788 return true;
16789 target = gen_reg_rtx (V4DImode);
16790 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16791 perm, 4, false))
16792 {
16793 emit_move_insn (d->target,
16794 gen_lowpart (d->vmode, target));
16795 return true;
16796 }
16797 return false;
16798 }
16799
16800 /* Next see if vpermd can be used. */
16801 if (valid_perm_using_mode_p (V8SImode, d))
16802 vmode = V8SImode;
16803 }
16804 /* Or if vpermps can be used. */
16805 else if (d->vmode == V8SFmode)
16806 vmode = V8SImode;
16807
16808 if (vmode == V32QImode)
16809 {
16810 /* vpshufb only works within lanes; it is not
16811    possible to shuffle bytes between the lanes. */
16812 for (i = 0; i < nelt; ++i)
16813 if ((d->perm[i] ^ i) & (nelt / 2))
16814 return false;
16815 }
16816 }
16817 else if (GET_MODE_SIZE (d->vmode) == 64)
16818 {
16819 if (!TARGET_AVX512BW)
16820 return false;
16821
16822 /* If vpermq didn't work, vpshufb won't work either. */
16823 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16824 return false;
16825
16826 vmode = V64QImode;
16827 if (d->vmode == V16SImode
16828 || d->vmode == V32HImode
16829 || d->vmode == V64QImode)
16830 {
16831 /* First see if vpermq can be used for
16832 V16SImode/V32HImode/V64QImode. */
16833 if (valid_perm_using_mode_p (V8DImode, d))
16834 {
16835 for (i = 0; i < 8; i++)
16836 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16837 if (d->testing_p)
16838 return true;
16839 target = gen_reg_rtx (V8DImode);
16840 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16841 perm, 8, false))
16842 {
16843 emit_move_insn (d->target,
16844 gen_lowpart (d->vmode, target));
16845 return true;
16846 }
16847 return false;
16848 }
16849
16850 /* Next see if vpermd can be used. */
16851 if (valid_perm_using_mode_p (V16SImode, d))
16852 vmode = V16SImode;
16853 }
16854 /* Or if vpermps can be used. */
16855 else if (d->vmode == V16SFmode)
16856 vmode = V16SImode;
16857 if (vmode == V64QImode)
16858 {
16859 /* vpshufb only works within lanes; it is not
16860    possible to shuffle bytes between the lanes. */
16861 for (i = 0; i < nelt; ++i)
16862 if ((d->perm[i] ^ i) & (3 * nelt / 4))
16863 return false;
16864 }
16865 }
16866 else
16867 return false;
16868 }
16869
16870 if (d->testing_p)
16871 return true;
16872
16873 if (vmode == V8SImode)
16874 for (i = 0; i < 8; ++i)
16875 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16876 else if (vmode == V16SImode)
16877 for (i = 0; i < 16; ++i)
16878 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16879 else
16880 {
16881 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16882 if (!d->one_operand_p)
16883 mask = 2 * nelt - 1;
16884 else if (vmode == V16QImode)
16885 mask = nelt - 1;
16886 else if (vmode == V64QImode)
16887 mask = nelt / 4 - 1;
16888 else
16889 mask = nelt / 2 - 1;
16890
16891 for (i = 0; i < nelt; ++i)
16892 {
16893 unsigned j, e = d->perm[i] & mask;
16894 for (j = 0; j < eltsz; ++j)
16895 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16896 }
16897 }
16898
16899 vperm = gen_rtx_CONST_VECTOR (vmode,
16900 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16901 vperm = force_reg (vmode, vperm);
16902
16903 target = d->target;
16904 if (d->vmode != vmode)
16905 target = gen_reg_rtx (vmode);
16906 op0 = gen_lowpart (vmode, d->op0);
16907 if (d->one_operand_p)
16908 {
16909 if (vmode == V16QImode)
16910 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16911 else if (vmode == V32QImode)
16912 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16913 else if (vmode == V64QImode)
16914 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16915 else if (vmode == V8SFmode)
16916 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16917 else if (vmode == V8SImode)
16918 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16919 else if (vmode == V16SFmode)
16920 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16921 else if (vmode == V16SImode)
16922 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16923 else
16924 gcc_unreachable ();
16925 }
16926 else
16927 {
16928 op1 = gen_lowpart (vmode, d->op1);
16929 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16930 }
16931 if (target != d->target)
16932 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16933
16934 return true;
16935 }
16936
16937 /* For V*[QHS]Imode permutations, check whether the same permutation
16938    can instead be performed in a 2x, 4x or 8x wider inner mode. */
16939
16940 static bool
16941 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16942 struct expand_vec_perm_d *nd)
16943 {
16944 int i;
16945 machine_mode mode = VOIDmode;
16946
16947 switch (d->vmode)
16948 {
16949 case E_V16QImode: mode = V8HImode; break;
16950 case E_V32QImode: mode = V16HImode; break;
16951 case E_V64QImode: mode = V32HImode; break;
16952 case E_V8HImode: mode = V4SImode; break;
16953 case E_V16HImode: mode = V8SImode; break;
16954 case E_V32HImode: mode = V16SImode; break;
16955 case E_V4SImode: mode = V2DImode; break;
16956 case E_V8SImode: mode = V4DImode; break;
16957 case E_V16SImode: mode = V8DImode; break;
16958 default: return false;
16959 }
16960 for (i = 0; i < d->nelt; i += 2)
16961 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16962 return false;
16963 nd->vmode = mode;
16964 nd->nelt = d->nelt / 2;
16965 for (i = 0; i < nd->nelt; i++)
16966 nd->perm[i] = d->perm[2 * i] / 2;
16967 if (GET_MODE_INNER (mode) != DImode)
16968 canonicalize_vector_int_perm (nd, nd);
16969 if (nd != d)
16970 {
16971 nd->one_operand_p = d->one_operand_p;
16972 nd->testing_p = d->testing_p;
16973 if (d->op0 == d->op1)
16974 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16975 else
16976 {
16977 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16978 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16979 }
16980 if (d->testing_p)
16981 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16982 else
16983 nd->target = gen_reg_rtx (nd->vmode);
16984 }
16985 return true;
16986 }
16987
16988 /* Try to expand one-operand permutation with constant mask. */
16989
16990 static bool
16991 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
16992 {
16993 machine_mode mode = GET_MODE (d->op0);
16994 machine_mode maskmode = mode;
16995 rtx (*gen) (rtx, rtx, rtx) = NULL;
16996 rtx target, op0, mask;
16997 rtx vec[64];
16998
16999 if (!rtx_equal_p (d->op0, d->op1))
17000 return false;
17001
17002 if (!TARGET_AVX512F)
17003 return false;
17004
17005 switch (mode)
17006 {
17007 case E_V16SImode:
17008 gen = gen_avx512f_permvarv16si;
17009 break;
17010 case E_V16SFmode:
17011 gen = gen_avx512f_permvarv16sf;
17012 maskmode = V16SImode;
17013 break;
17014 case E_V8DImode:
17015 gen = gen_avx512f_permvarv8di;
17016 break;
17017 case E_V8DFmode:
17018 gen = gen_avx512f_permvarv8df;
17019 maskmode = V8DImode;
17020 break;
17021 default:
17022 return false;
17023 }
17024
17025 target = d->target;
17026 op0 = d->op0;
17027 for (int i = 0; i < d->nelt; ++i)
17028 vec[i] = GEN_INT (d->perm[i]);
17029 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17030 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17031 return true;
17032 }
17033
17034 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17035
17036 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17037 in a single instruction. */
17038
17039 static bool
17040 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17041 {
17042 unsigned i, nelt = d->nelt;
17043 struct expand_vec_perm_d nd;
17044
17045 /* Check plain VEC_SELECT first, because AVX has instructions that could
17046 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17047 input where SEL+CONCAT may not. */
17048 if (d->one_operand_p)
17049 {
17050 int mask = nelt - 1;
17051 bool identity_perm = true;
17052 bool broadcast_perm = true;
17053
17054 for (i = 0; i < nelt; i++)
17055 {
17056 nd.perm[i] = d->perm[i] & mask;
17057 if (nd.perm[i] != i)
17058 identity_perm = false;
17059 if (nd.perm[i])
17060 broadcast_perm = false;
17061 }
17062
17063 if (identity_perm)
17064 {
17065 if (!d->testing_p)
17066 emit_move_insn (d->target, d->op0);
17067 return true;
17068 }
17069 else if (broadcast_perm && TARGET_AVX2)
17070 {
17071 /* Use vpbroadcast{b,w,d}. */
17072 rtx (*gen) (rtx, rtx) = NULL;
17073 switch (d->vmode)
17074 {
17075 case E_V64QImode:
17076 if (TARGET_AVX512BW)
17077 gen = gen_avx512bw_vec_dupv64qi_1;
17078 break;
17079 case E_V32QImode:
17080 gen = gen_avx2_pbroadcastv32qi_1;
17081 break;
17082 case E_V32HImode:
17083 if (TARGET_AVX512BW)
17084 gen = gen_avx512bw_vec_dupv32hi_1;
17085 break;
17086 case E_V16HImode:
17087 gen = gen_avx2_pbroadcastv16hi_1;
17088 break;
17089 case E_V16SImode:
17090 if (TARGET_AVX512F)
17091 gen = gen_avx512f_vec_dupv16si_1;
17092 break;
17093 case E_V8SImode:
17094 gen = gen_avx2_pbroadcastv8si_1;
17095 break;
17096 case E_V16QImode:
17097 gen = gen_avx2_pbroadcastv16qi;
17098 break;
17099 case E_V8HImode:
17100 gen = gen_avx2_pbroadcastv8hi;
17101 break;
17102 case E_V16SFmode:
17103 if (TARGET_AVX512F)
17104 gen = gen_avx512f_vec_dupv16sf_1;
17105 break;
17106 case E_V8SFmode:
17107 gen = gen_avx2_vec_dupv8sf_1;
17108 break;
17109 case E_V8DFmode:
17110 if (TARGET_AVX512F)
17111 gen = gen_avx512f_vec_dupv8df_1;
17112 break;
17113 case E_V8DImode:
17114 if (TARGET_AVX512F)
17115 gen = gen_avx512f_vec_dupv8di_1;
17116 break;
17117 /* For other modes prefer other shuffles this function creates. */
17118 default: break;
17119 }
17120 if (gen != NULL)
17121 {
17122 if (!d->testing_p)
17123 emit_insn (gen (d->target, d->op0));
17124 return true;
17125 }
17126 }
17127
17128 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17129 return true;
17130
17131 /* There are plenty of patterns in sse.md that are written for
17132 SEL+CONCAT and are not replicated for a single op. Perhaps
17133 that should be changed, to avoid the nastiness here. */
17134
17135 /* Recognize interleave style patterns, which means incrementing
17136 every other permutation operand. */
17137 for (i = 0; i < nelt; i += 2)
17138 {
17139 nd.perm[i] = d->perm[i] & mask;
17140 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17141 }
17142 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17143 d->testing_p))
17144 return true;
17145
17146 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17147 if (nelt >= 4)
17148 {
17149 for (i = 0; i < nelt; i += 4)
17150 {
17151 nd.perm[i + 0] = d->perm[i + 0] & mask;
17152 nd.perm[i + 1] = d->perm[i + 1] & mask;
17153 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17154 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17155 }
17156
17157 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17158 d->testing_p))
17159 return true;
17160 }
17161 }
17162
17163 /* Try movss/movsd instructions. */
17164 if (expand_vec_perm_movs (d))
17165 return true;
17166
17167 /* Finally, try the fully general two operand permute. */
17168 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17169 d->testing_p))
17170 return true;
17171
17172 /* Recognize interleave style patterns with reversed operands. */
17173 if (!d->one_operand_p)
17174 {
17175 for (i = 0; i < nelt; ++i)
17176 {
17177 unsigned e = d->perm[i];
17178 if (e >= nelt)
17179 e -= nelt;
17180 else
17181 e += nelt;
17182 nd.perm[i] = e;
17183 }
17184
17185 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17186 d->testing_p))
17187 return true;
17188 }
17189
17190 /* Try the SSE4.1 blend variable merge instructions. */
17191 if (expand_vec_perm_blend (d))
17192 return true;
17193
17194 /* Try one of the AVX vpermil variable permutations. */
17195 if (expand_vec_perm_vpermil (d))
17196 return true;
17197
17198 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17199 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17200 if (expand_vec_perm_pshufb (d))
17201 return true;
17202
17203 /* Try the AVX2 vpalignr instruction. */
17204 if (expand_vec_perm_palignr (d, true))
17205 return true;
17206
17207 /* Try the AVX512F vperm{s,d} instructions. */
17208 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17209 return true;
17210
17211 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17212 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17213 return true;
17214
17215 /* See if we can get the same permutation in a different vector integer
17216    mode. */
17217 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17218 {
17219 if (!d->testing_p)
17220 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17221 return true;
17222 }
17223 return false;
17224 }
17225
17226 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17227 in terms of a pair of pshuflw + pshufhw instructions. */
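/* E.g. the one-operand V8HImode permutation { 2, 0, 3, 1, 5, 7, 4, 6 }
   keeps the first four and last four elements within their own 64-bit
   halves, so it can be expanded as pshuflw with { 2, 0, 3, 1, 4, 5, 6, 7 }
   followed by pshufhw with { 0, 1, 2, 3, 5, 7, 4, 6 }.  */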
17228
17229 static bool
17230 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17231 {
17232 unsigned char perm2[MAX_VECT_LEN];
17233 unsigned i;
17234 bool ok;
17235
17236 if (d->vmode != V8HImode || !d->one_operand_p)
17237 return false;
17238
17239 /* The two permutations only operate in 64-bit lanes. */
17240 for (i = 0; i < 4; ++i)
17241 if (d->perm[i] >= 4)
17242 return false;
17243 for (i = 4; i < 8; ++i)
17244 if (d->perm[i] < 4)
17245 return false;
17246
17247 if (d->testing_p)
17248 return true;
17249
17250 /* Emit the pshuflw. */
17251 memcpy (perm2, d->perm, 4);
17252 for (i = 4; i < 8; ++i)
17253 perm2[i] = i;
17254 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17255 gcc_assert (ok);
17256
17257 /* Emit the pshufhw. */
17258 memcpy (perm2 + 4, d->perm + 4, 4);
17259 for (i = 0; i < 4; ++i)
17260 perm2[i] = i;
17261 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17262 gcc_assert (ok);
17263
17264 return true;
17265 }
17266
17267 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17268 the permutation using the SSSE3 palignr instruction. This succeeds
17269 when all of the elements in PERM fit within one vector and we merely
17270 need to shift them down so that a single vector permutation has a
17271    chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
17272    the vpalignr instruction itself can perform the requested permutation. */
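/* E.g. the two-operand V8HImode permutation { 2, 3, 4, 5, 6, 7, 8, 9 }
   has min == 2 and max == 9, so shifting the op1:op0 concatenation right
   by two elements with palignr already yields the desired order (the
   in_order case below); otherwise the shifted vector is handed to a
   one-operand permutation.  */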
17273
17274 static bool
17275 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17276 {
17277 unsigned i, nelt = d->nelt;
17278 unsigned min, max, minswap, maxswap;
17279 bool in_order, ok, swap = false;
17280 rtx shift, target;
17281 struct expand_vec_perm_d dcopy;
17282
17283 /* Even with AVX, palignr only operates on 128-bit vectors;
17284    with AVX2 palignr operates on each of the two 128-bit lanes. */
17285 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17286 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17287 return false;
17288
17289 min = 2 * nelt;
17290 max = 0;
17291 minswap = 2 * nelt;
17292 maxswap = 0;
17293 for (i = 0; i < nelt; ++i)
17294 {
17295 unsigned e = d->perm[i];
17296 unsigned eswap = d->perm[i] ^ nelt;
17297 if (GET_MODE_SIZE (d->vmode) == 32)
17298 {
17299 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17300 eswap = e ^ (nelt / 2);
17301 }
17302 if (e < min)
17303 min = e;
17304 if (e > max)
17305 max = e;
17306 if (eswap < minswap)
17307 minswap = eswap;
17308 if (eswap > maxswap)
17309 maxswap = eswap;
17310 }
17311 if (min == 0
17312 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17313 {
17314 if (d->one_operand_p
17315 || minswap == 0
17316 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17317 ? nelt / 2 : nelt))
17318 return false;
17319 swap = true;
17320 min = minswap;
17321 max = maxswap;
17322 }
17323
17324 /* Given that we have SSSE3, we know we'll be able to implement the
17325 single operand permutation after the palignr with pshufb for
17326 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17327 first. */
17328 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17329 return true;
17330
17331 dcopy = *d;
17332 if (swap)
17333 {
17334 dcopy.op0 = d->op1;
17335 dcopy.op1 = d->op0;
17336 for (i = 0; i < nelt; ++i)
17337 dcopy.perm[i] ^= nelt;
17338 }
17339
17340 in_order = true;
17341 for (i = 0; i < nelt; ++i)
17342 {
17343 unsigned e = dcopy.perm[i];
17344 if (GET_MODE_SIZE (d->vmode) == 32
17345 && e >= nelt
17346 && (e & (nelt / 2 - 1)) < min)
17347 e = e - min - (nelt / 2);
17348 else
17349 e = e - min;
17350 if (e != i)
17351 in_order = false;
17352 dcopy.perm[i] = e;
17353 }
17354 dcopy.one_operand_p = true;
17355
17356 if (single_insn_only_p && !in_order)
17357 return false;
17358
17359 /* For AVX2, test whether we can permute the result in one instruction. */
17360 if (d->testing_p)
17361 {
17362 if (in_order)
17363 return true;
17364 dcopy.op1 = dcopy.op0;
17365 return expand_vec_perm_1 (&dcopy);
17366 }
17367
17368 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17369 if (GET_MODE_SIZE (d->vmode) == 16)
17370 {
17371 target = gen_reg_rtx (TImode);
17372 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17373 gen_lowpart (TImode, dcopy.op0), shift));
17374 }
17375 else
17376 {
17377 target = gen_reg_rtx (V2TImode);
17378 emit_insn (gen_avx2_palignrv2ti (target,
17379 gen_lowpart (V2TImode, dcopy.op1),
17380 gen_lowpart (V2TImode, dcopy.op0),
17381 shift));
17382 }
17383
17384 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17385
17386 /* Test for the degenerate case where the alignment by itself
17387 produces the desired permutation. */
17388 if (in_order)
17389 {
17390 emit_move_insn (d->target, dcopy.op0);
17391 return true;
17392 }
17393
17394 ok = expand_vec_perm_1 (&dcopy);
17395 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17396
17397 return ok;
17398 }
17399
17400 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17401 the permutation using the SSE4_1 pblendv instruction. Potentially
17402 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
17403
17404 static bool
17405 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17406 {
17407 unsigned i, which, nelt = d->nelt;
17408 struct expand_vec_perm_d dcopy, dcopy1;
17409 machine_mode vmode = d->vmode;
17410 bool ok;
17411
17412 /* Use the same checks as in expand_vec_perm_blend. */
17413 if (d->one_operand_p)
17414 return false;
17415 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17416 ;
17417 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17418 ;
17419 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17420 ;
17421 else
17422 return false;
17423
17424 /* Figure out which permutation elements do not stay in their
17425    respective lanes. */
17426 for (i = 0, which = 0; i < nelt; ++i)
17427 {
17428 unsigned e = d->perm[i];
17429 if (e != i)
17430 which |= (e < nelt ? 1 : 2);
17431 }
17432 /* We can pblend the part where elements are not in their
17433    respective lanes only when these elements all come from one
17434    half of the permutation.
17435    {0 1 8 3 4 5 9 7} is ok as 8 and 9 are not in their respective
17436    lanes, but both 8 and 9 >= 8;
17437    {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not in their
17438    respective lanes and 8 >= 8, but 2 is not. */
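  /* E.g. for { 0 1 8 3 4 5 9 7 } the out-of-place elements 8 and 9 both
     come from the second operand, so op1 is first permuted with the
     one-operand mask { 0 1 0 3 4 5 1 7 } and then blended into op0 at
     positions 2 and 6.  */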
17439 if (which != 1 && which != 2)
17440 return false;
17441 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17442 return true;
17443
17444 /* First we apply one operand permutation to the part where
17445 elements stay not in their respective lanes. */
17446 dcopy = *d;
17447 if (which == 2)
17448 dcopy.op0 = dcopy.op1 = d->op1;
17449 else
17450 dcopy.op0 = dcopy.op1 = d->op0;
17451 if (!d->testing_p)
17452 dcopy.target = gen_reg_rtx (vmode);
17453 dcopy.one_operand_p = true;
17454
17455 for (i = 0; i < nelt; ++i)
17456 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17457
17458 ok = expand_vec_perm_1 (&dcopy);
17459 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17460 return false;
17461 else
17462 gcc_assert (ok);
17463 if (d->testing_p)
17464 return true;
17465
17466 /* Next we put permuted elements into their positions. */
17467 dcopy1 = *d;
17468 if (which == 2)
17469 dcopy1.op1 = dcopy.target;
17470 else
17471 dcopy1.op0 = dcopy.target;
17472
17473 for (i = 0; i < nelt; ++i)
17474 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17475
17476 ok = expand_vec_perm_blend (&dcopy1);
17477 gcc_assert (ok);
17478
17479 return true;
17480 }
17481
17482 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17483
17484 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17485 a two vector permutation into a single vector permutation by using
17486 an interleave operation to merge the vectors. */
17487
17488 static bool
17489 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17490 {
17491 struct expand_vec_perm_d dremap, dfinal;
17492 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17493 unsigned HOST_WIDE_INT contents;
17494 unsigned char remap[2 * MAX_VECT_LEN];
17495 rtx_insn *seq;
17496 bool ok, same_halves = false;
17497
17498 if (GET_MODE_SIZE (d->vmode) == 16)
17499 {
17500 if (d->one_operand_p)
17501 return false;
17502 }
17503 else if (GET_MODE_SIZE (d->vmode) == 32)
17504 {
17505 if (!TARGET_AVX)
17506 return false;
17507 /* For 32-byte modes allow even d->one_operand_p.
17508 The lack of cross-lane shuffling in some instructions
17509 might prevent a single insn shuffle. */
17510 dfinal = *d;
17511 dfinal.testing_p = true;
17512 /* If expand_vec_perm_interleave3 can expand this into
17513    a 3 insn sequence, give up and let it be expanded as
17514    a 3 insn sequence. While that is one insn longer,
17515    it doesn't need a memory operand, and in the common
17516    case where both the interleave low and high permutations
17517    with the same operands are adjacent, it needs only 4 insns
17518    for both after CSE. */
17519 if (expand_vec_perm_interleave3 (&dfinal))
17520 return false;
17521 }
17522 else
17523 return false;
17524
17525 /* Examine from whence the elements come. */
17526 contents = 0;
17527 for (i = 0; i < nelt; ++i)
17528 contents |= HOST_WIDE_INT_1U << d->perm[i];
17529
17530 memset (remap, 0xff, sizeof (remap));
17531 dremap = *d;
17532
17533 if (GET_MODE_SIZE (d->vmode) == 16)
17534 {
17535 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17536
17537 /* Split the two input vectors into 4 halves. */
17538 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17539 h2 = h1 << nelt2;
17540 h3 = h2 << nelt2;
17541 h4 = h3 << nelt2;
17542
17543 /* If the elements are all from the low halves, use interleave low;
17544    similarly for interleave high. If the elements are from mis-matched
17545    halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
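      /* E.g. the V4SImode permutation { 1, 4, 0, 5 } only uses the low
         halves of both operands, so punpckldq first produces { 0, 4, 1, 5 }
         and the remaining one-operand shuffle is { 2, 1, 0, 3 }.  */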
17546 if ((contents & (h1 | h3)) == contents)
17547 {
17548 /* punpckl* */
17549 for (i = 0; i < nelt2; ++i)
17550 {
17551 remap[i] = i * 2;
17552 remap[i + nelt] = i * 2 + 1;
17553 dremap.perm[i * 2] = i;
17554 dremap.perm[i * 2 + 1] = i + nelt;
17555 }
17556 if (!TARGET_SSE2 && d->vmode == V4SImode)
17557 dremap.vmode = V4SFmode;
17558 }
17559 else if ((contents & (h2 | h4)) == contents)
17560 {
17561 /* punpckh* */
17562 for (i = 0; i < nelt2; ++i)
17563 {
17564 remap[i + nelt2] = i * 2;
17565 remap[i + nelt + nelt2] = i * 2 + 1;
17566 dremap.perm[i * 2] = i + nelt2;
17567 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17568 }
17569 if (!TARGET_SSE2 && d->vmode == V4SImode)
17570 dremap.vmode = V4SFmode;
17571 }
17572 else if ((contents & (h1 | h4)) == contents)
17573 {
17574 /* shufps */
17575 for (i = 0; i < nelt2; ++i)
17576 {
17577 remap[i] = i;
17578 remap[i + nelt + nelt2] = i + nelt2;
17579 dremap.perm[i] = i;
17580 dremap.perm[i + nelt2] = i + nelt + nelt2;
17581 }
17582 if (nelt != 4)
17583 {
17584 /* shufpd */
17585 dremap.vmode = V2DImode;
17586 dremap.nelt = 2;
17587 dremap.perm[0] = 0;
17588 dremap.perm[1] = 3;
17589 }
17590 }
17591 else if ((contents & (h2 | h3)) == contents)
17592 {
17593 /* shufps */
17594 for (i = 0; i < nelt2; ++i)
17595 {
17596 remap[i + nelt2] = i;
17597 remap[i + nelt] = i + nelt2;
17598 dremap.perm[i] = i + nelt2;
17599 dremap.perm[i + nelt2] = i + nelt;
17600 }
17601 if (nelt != 4)
17602 {
17603 /* shufpd */
17604 dremap.vmode = V2DImode;
17605 dremap.nelt = 2;
17606 dremap.perm[0] = 1;
17607 dremap.perm[1] = 2;
17608 }
17609 }
17610 else
17611 return false;
17612 }
17613 else
17614 {
17615 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17616 unsigned HOST_WIDE_INT q[8];
17617 unsigned int nonzero_halves[4];
17618
17619 /* Split the two input vectors into 8 quarters. */
17620 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17621 for (i = 1; i < 8; ++i)
17622 q[i] = q[0] << (nelt4 * i);
17623 for (i = 0; i < 4; ++i)
17624 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17625 {
17626 nonzero_halves[nzcnt] = i;
17627 ++nzcnt;
17628 }
17629
17630 if (nzcnt == 1)
17631 {
17632 gcc_assert (d->one_operand_p);
17633 nonzero_halves[1] = nonzero_halves[0];
17634 same_halves = true;
17635 }
17636 else if (d->one_operand_p)
17637 {
17638 gcc_assert (nonzero_halves[0] == 0);
17639 gcc_assert (nonzero_halves[1] == 1);
17640 }
17641
17642 if (nzcnt <= 2)
17643 {
17644 if (d->perm[0] / nelt2 == nonzero_halves[1])
17645 {
17646 /* Attempt to increase the likelihood that dfinal
17647 shuffle will be intra-lane. */
17648 std::swap (nonzero_halves[0], nonzero_halves[1]);
17649 }
17650
17651 /* vperm2f128 or vperm2i128. */
17652 for (i = 0; i < nelt2; ++i)
17653 {
17654 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17655 remap[i + nonzero_halves[0] * nelt2] = i;
17656 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17657 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17658 }
17659
17660 if (d->vmode != V8SFmode
17661 && d->vmode != V4DFmode
17662 && d->vmode != V8SImode)
17663 {
17664 dremap.vmode = V8SImode;
17665 dremap.nelt = 8;
17666 for (i = 0; i < 4; ++i)
17667 {
17668 dremap.perm[i] = i + nonzero_halves[0] * 4;
17669 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17670 }
17671 }
17672 }
17673 else if (d->one_operand_p)
17674 return false;
17675 else if (TARGET_AVX2
17676 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17677 {
17678 /* vpunpckl* */
17679 for (i = 0; i < nelt4; ++i)
17680 {
17681 remap[i] = i * 2;
17682 remap[i + nelt] = i * 2 + 1;
17683 remap[i + nelt2] = i * 2 + nelt2;
17684 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17685 dremap.perm[i * 2] = i;
17686 dremap.perm[i * 2 + 1] = i + nelt;
17687 dremap.perm[i * 2 + nelt2] = i + nelt2;
17688 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17689 }
17690 }
17691 else if (TARGET_AVX2
17692 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17693 {
17694 /* vpunpckh* */
17695 for (i = 0; i < nelt4; ++i)
17696 {
17697 remap[i + nelt4] = i * 2;
17698 remap[i + nelt + nelt4] = i * 2 + 1;
17699 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17700 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17701 dremap.perm[i * 2] = i + nelt4;
17702 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17703 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17704 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17705 }
17706 }
17707 else
17708 return false;
17709 }
17710
17711 /* Use the remapping array set up above to move the elements from their
17712 swizzled locations into their final destinations. */
17713 dfinal = *d;
17714 for (i = 0; i < nelt; ++i)
17715 {
17716 unsigned e = remap[d->perm[i]];
17717 gcc_assert (e < nelt);
17718 /* If same_halves is true, both halves of the remapped vector are the
17719 same. Avoid cross-lane accesses if possible. */
17720 if (same_halves && i >= nelt2)
17721 {
17722 gcc_assert (e < nelt2);
17723 dfinal.perm[i] = e + nelt2;
17724 }
17725 else
17726 dfinal.perm[i] = e;
17727 }
17728 if (!d->testing_p)
17729 {
17730 dremap.target = gen_reg_rtx (dremap.vmode);
17731 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17732 }
17733 dfinal.op1 = dfinal.op0;
17734 dfinal.one_operand_p = true;
17735
17736 /* Test if the final remap can be done with a single insn. For V4SFmode or
17737 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17738 start_sequence ();
17739 ok = expand_vec_perm_1 (&dfinal);
17740 seq = get_insns ();
17741 end_sequence ();
17742
17743 if (!ok)
17744 return false;
17745
17746 if (d->testing_p)
17747 return true;
17748
17749 if (dremap.vmode != dfinal.vmode)
17750 {
17751 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17752 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17753 }
17754
17755 ok = expand_vec_perm_1 (&dremap);
17756 gcc_assert (ok);
17757
17758 emit_insn (seq);
17759 return true;
17760 }
17761
17762 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17763 a single vector cross-lane permutation into vpermq followed
17764 by any of the single insn permutations. */
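/* E.g. a one-operand V16HImode permutation whose low result half uses
   only elements 4-7 and 12-15 (64-bit quarters 1 and 3) and whose high
   half uses only elements 0-3 and 8-11 (quarters 0 and 2) can be expanded
   as a vpermq with { 1, 3, 0, 2 } followed by an intra-lane shuffle.  */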
17765
17766 static bool
17767 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17768 {
17769 struct expand_vec_perm_d dremap, dfinal;
17770 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17771 unsigned contents[2];
17772 bool ok;
17773
17774 if (!(TARGET_AVX2
17775 && (d->vmode == V32QImode || d->vmode == V16HImode)
17776 && d->one_operand_p))
17777 return false;
17778
17779 contents[0] = 0;
17780 contents[1] = 0;
17781 for (i = 0; i < nelt2; ++i)
17782 {
17783 contents[0] |= 1u << (d->perm[i] / nelt4);
17784 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17785 }
17786
17787 for (i = 0; i < 2; ++i)
17788 {
17789 unsigned int cnt = 0;
17790 for (j = 0; j < 4; ++j)
17791 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17792 return false;
17793 }
17794
17795 if (d->testing_p)
17796 return true;
17797
17798 dremap = *d;
17799 dremap.vmode = V4DImode;
17800 dremap.nelt = 4;
17801 dremap.target = gen_reg_rtx (V4DImode);
17802 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17803 dremap.op1 = dremap.op0;
17804 dremap.one_operand_p = true;
17805 for (i = 0; i < 2; ++i)
17806 {
17807 unsigned int cnt = 0;
17808 for (j = 0; j < 4; ++j)
17809 if ((contents[i] & (1u << j)) != 0)
17810 dremap.perm[2 * i + cnt++] = j;
17811 for (; cnt < 2; ++cnt)
17812 dremap.perm[2 * i + cnt] = 0;
17813 }
17814
17815 dfinal = *d;
17816 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17817 dfinal.op1 = dfinal.op0;
17818 dfinal.one_operand_p = true;
17819 for (i = 0, j = 0; i < nelt; ++i)
17820 {
17821 if (i == nelt2)
17822 j = 2;
17823 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17824 if ((d->perm[i] / nelt4) == dremap.perm[j])
17825 ;
17826 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17827 dfinal.perm[i] |= nelt4;
17828 else
17829 gcc_unreachable ();
17830 }
17831
17832 ok = expand_vec_perm_1 (&dremap);
17833 gcc_assert (ok);
17834
17835 ok = expand_vec_perm_1 (&dfinal);
17836 gcc_assert (ok);
17837
17838 return true;
17839 }
17840
17841 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17842
17843 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
17844 a vector permutation using two instructions, vperm2f128 resp.
17845 vperm2i128 followed by any single in-lane permutation. */
17846
17847 static bool
17848 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17849 {
17850 struct expand_vec_perm_d dfirst, dsecond;
17851 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17852 bool ok;
17853
17854 if (!TARGET_AVX
17855 || GET_MODE_SIZE (d->vmode) != 32
17856 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17857 return false;
17858
17859 dsecond = *d;
17860 dsecond.one_operand_p = false;
17861 dsecond.testing_p = true;
17862
17863 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17864 immediate. For perm < 16 the second permutation uses
17865 d->op0 as first operand, for perm >= 16 it uses d->op1
17866 as first operand. The second operand is the result of
17867 vperm2[fi]128. */
17868 for (perm = 0; perm < 32; perm++)
17869 {
17870 /* Ignore permutations which do not move anything cross-lane. */
17871 if (perm < 16)
17872 {
17873 /* The second shuffle for e.g. V4DFmode has
17874 0123 and ABCD operands.
17875 Ignore AB23, as 23 is already in the second lane
17876 of the first operand. */
17877 if ((perm & 0xc) == (1 << 2)) continue;
17878 /* And 01CD, as 01 is in the first lane of the first
17879 operand. */
17880 if ((perm & 3) == 0) continue;
17881 /* And 4567, as then the vperm2[fi]128 doesn't change
17882 anything on the original 4567 second operand. */
17883 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17884 }
17885 else
17886 {
17887 /* The second shuffle for e.g. V4DFmode has
17888 4567 and ABCD operands.
17889 Ignore AB67, as 67 is already in the second lane
17890 of the first operand. */
17891 if ((perm & 0xc) == (3 << 2)) continue;
17892 /* And 45CD, as 45 is in the first lane of the first
17893 operand. */
17894 if ((perm & 3) == 2) continue;
17895 /* And 0123, as then the vperm2[fi]128 doesn't change
17896 anything on the original 0123 first operand. */
17897 if ((perm & 0xf) == (1 << 2)) continue;
17898 }
17899
17900 for (i = 0; i < nelt; i++)
17901 {
17902 j = d->perm[i] / nelt2;
17903 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17904 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17905 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17906 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17907 else
17908 break;
17909 }
17910
17911 if (i == nelt)
17912 {
17913 start_sequence ();
17914 ok = expand_vec_perm_1 (&dsecond);
17915 end_sequence ();
17916 }
17917 else
17918 ok = false;
17919
17920 if (ok)
17921 {
17922 if (d->testing_p)
17923 return true;
17924
17925 /* Found a usable second shuffle. dfirst will be
17926 vperm2f128 on d->op0 and d->op1. */
17927 dsecond.testing_p = false;
17928 dfirst = *d;
17929 dfirst.target = gen_reg_rtx (d->vmode);
17930 for (i = 0; i < nelt; i++)
17931 dfirst.perm[i] = (i & (nelt2 - 1))
17932 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17933
17934 canonicalize_perm (&dfirst);
17935 ok = expand_vec_perm_1 (&dfirst);
17936 gcc_assert (ok);
17937
17938 /* And dsecond is some single insn shuffle, taking
17939 d->op0 and result of vperm2f128 (if perm < 16) or
17940 d->op1 and result of vperm2f128 (otherwise). */
17941 if (perm >= 16)
17942 dsecond.op0 = dsecond.op1;
17943 dsecond.op1 = dfirst.target;
17944
17945 ok = expand_vec_perm_1 (&dsecond);
17946 gcc_assert (ok);
17947
17948 return true;
17949 }
17950
17951 /* For one operand, the only useful vperm2f128 permutation is 0x01
17952 aka lanes swap. */
17953 if (d->one_operand_p)
17954 return false;
17955 }
17956
17957 return false;
17958 }
17959
17960 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17961 a two vector permutation using 2 intra-lane interleave insns
17962 and cross-lane shuffle for 32-byte vectors. */
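/* E.g. the two-operand V8SImode permutations { 0, 8, 1, 9, 2, 10, 3, 11 }
   and { 4, 12, 5, 13, 6, 14, 7, 15 } match this pattern (interleave low
   resp. interleave high of the two operands).  */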
17963
17964 static bool
17965 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17966 {
17967 unsigned i, nelt;
17968 rtx (*gen) (rtx, rtx, rtx);
17969
17970 if (d->one_operand_p)
17971 return false;
17972 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17973 ;
17974 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17975 ;
17976 else
17977 return false;
17978
17979 nelt = d->nelt;
17980 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17981 return false;
17982 for (i = 0; i < nelt; i += 2)
17983 if (d->perm[i] != d->perm[0] + i / 2
17984 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17985 return false;
17986
17987 if (d->testing_p)
17988 return true;
17989
17990 switch (d->vmode)
17991 {
17992 case E_V32QImode:
17993 if (d->perm[0])
17994 gen = gen_vec_interleave_highv32qi;
17995 else
17996 gen = gen_vec_interleave_lowv32qi;
17997 break;
17998 case E_V16HImode:
17999 if (d->perm[0])
18000 gen = gen_vec_interleave_highv16hi;
18001 else
18002 gen = gen_vec_interleave_lowv16hi;
18003 break;
18004 case E_V8SImode:
18005 if (d->perm[0])
18006 gen = gen_vec_interleave_highv8si;
18007 else
18008 gen = gen_vec_interleave_lowv8si;
18009 break;
18010 case E_V4DImode:
18011 if (d->perm[0])
18012 gen = gen_vec_interleave_highv4di;
18013 else
18014 gen = gen_vec_interleave_lowv4di;
18015 break;
18016 case E_V8SFmode:
18017 if (d->perm[0])
18018 gen = gen_vec_interleave_highv8sf;
18019 else
18020 gen = gen_vec_interleave_lowv8sf;
18021 break;
18022 case E_V4DFmode:
18023 if (d->perm[0])
18024 gen = gen_vec_interleave_highv4df;
18025 else
18026 gen = gen_vec_interleave_lowv4df;
18027 break;
18028 default:
18029 gcc_unreachable ();
18030 }
18031
18032 emit_insn (gen (d->target, d->op0, d->op1));
18033 return true;
18034 }
18035
18036 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18037 a single vector permutation using a single intra-lane vector
18038 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18039 the non-swapped and swapped vectors together. */
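/* E.g. the one-operand V4DFmode permutation { 0, 3, 2, 1 } is expanded
   as a (here trivial) intra-lane permutation, a vperm2f128 lane swap
   producing { 2, 3, 0, 1 }, and a vblendpd with mask 0b1010 that takes
   elements 1 and 3 from the lane-swapped vector.  */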
18040
18041 static bool
18042 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18043 {
18044 struct expand_vec_perm_d dfirst, dsecond;
18045 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18046 rtx_insn *seq;
18047 bool ok;
18048 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18049
18050 if (!TARGET_AVX
18051 || TARGET_AVX2
18052 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18053 || !d->one_operand_p)
18054 return false;
18055
18056 dfirst = *d;
18057 for (i = 0; i < nelt; i++)
18058 dfirst.perm[i] = 0xff;
18059 for (i = 0, msk = 0; i < nelt; i++)
18060 {
18061 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18062 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18063 return false;
18064 dfirst.perm[j] = d->perm[i];
18065 if (j != i)
18066 msk |= (1 << i);
18067 }
18068 for (i = 0; i < nelt; i++)
18069 if (dfirst.perm[i] == 0xff)
18070 dfirst.perm[i] = i;
18071
18072 if (!d->testing_p)
18073 dfirst.target = gen_reg_rtx (dfirst.vmode);
18074
18075 start_sequence ();
18076 ok = expand_vec_perm_1 (&dfirst);
18077 seq = get_insns ();
18078 end_sequence ();
18079
18080 if (!ok)
18081 return false;
18082
18083 if (d->testing_p)
18084 return true;
18085
18086 emit_insn (seq);
18087
18088 dsecond = *d;
18089 dsecond.op0 = dfirst.target;
18090 dsecond.op1 = dfirst.target;
18091 dsecond.one_operand_p = true;
18092 dsecond.target = gen_reg_rtx (dsecond.vmode);
18093 for (i = 0; i < nelt; i++)
18094 dsecond.perm[i] = i ^ nelt2;
18095
18096 ok = expand_vec_perm_1 (&dsecond);
18097 gcc_assert (ok);
18098
18099 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18100 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18101 return true;
18102 }
18103
18104 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18105 permutation using two vperm2f128, followed by a vshufpd insn blending
18106 the two vectors together. */
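/* E.g. the V4DFmode permutation { 3, 6, 1, 4 } is decomposed into
   dfirst = { 2, 3, 0, 1 } and dsecond = { 6, 7, 4, 5 } (lane swaps of the
   two operands via vperm2f128) and dthird = { 1, 4, 3, 6 }, the final
   vshufpd blending the two intermediate results.  */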
18107
18108 static bool
18109 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18110 {
18111 struct expand_vec_perm_d dfirst, dsecond, dthird;
18112 bool ok;
18113
18114 if (!TARGET_AVX || (d->vmode != V4DFmode))
18115 return false;
18116
18117 if (d->testing_p)
18118 return true;
18119
18120 dfirst = *d;
18121 dsecond = *d;
18122 dthird = *d;
18123
18124 dfirst.perm[0] = (d->perm[0] & ~1);
18125 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18126 dfirst.perm[2] = (d->perm[2] & ~1);
18127 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18128 dsecond.perm[0] = (d->perm[1] & ~1);
18129 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18130 dsecond.perm[2] = (d->perm[3] & ~1);
18131 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18132 dthird.perm[0] = (d->perm[0] % 2);
18133 dthird.perm[1] = (d->perm[1] % 2) + 4;
18134 dthird.perm[2] = (d->perm[2] % 2) + 2;
18135 dthird.perm[3] = (d->perm[3] % 2) + 6;
18136
18137 dfirst.target = gen_reg_rtx (dfirst.vmode);
18138 dsecond.target = gen_reg_rtx (dsecond.vmode);
18139 dthird.op0 = dfirst.target;
18140 dthird.op1 = dsecond.target;
18141 dthird.one_operand_p = false;
18142
18143 canonicalize_perm (&dfirst);
18144 canonicalize_perm (&dsecond);
18145
18146 ok = expand_vec_perm_1 (&dfirst)
18147 && expand_vec_perm_1 (&dsecond)
18148 && expand_vec_perm_1 (&dthird);
18149
18150 gcc_assert (ok);
18151
18152 return true;
18153 }
18154
18155 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18156
18157 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18158 a two vector permutation using two intra-lane vector
18159 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18160 the non-swapped and swapped vectors together. */
18161
18162 static bool
18163 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18164 {
18165 struct expand_vec_perm_d dfirst, dsecond, dthird;
18166 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18167 rtx_insn *seq1, *seq2;
18168 bool ok;
18169 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18170
18171 if (!TARGET_AVX
18172 || TARGET_AVX2
18173 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18174 || d->one_operand_p)
18175 return false;
18176
18177 dfirst = *d;
18178 dsecond = *d;
18179 for (i = 0; i < nelt; i++)
18180 {
18181 dfirst.perm[i] = 0xff;
18182 dsecond.perm[i] = 0xff;
18183 }
18184 for (i = 0, msk = 0; i < nelt; i++)
18185 {
18186 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18187 if (j == i)
18188 {
18189 dfirst.perm[j] = d->perm[i];
18190 which1 |= (d->perm[i] < nelt ? 1 : 2);
18191 }
18192 else
18193 {
18194 dsecond.perm[j] = d->perm[i];
18195 which2 |= (d->perm[i] < nelt ? 1 : 2);
18196 msk |= (1U << i);
18197 }
18198 }
18199 if (msk == 0 || msk == (1U << nelt) - 1)
18200 return false;
18201
18202 if (!d->testing_p)
18203 {
18204 dfirst.target = gen_reg_rtx (dfirst.vmode);
18205 dsecond.target = gen_reg_rtx (dsecond.vmode);
18206 }
18207
18208 for (i = 0; i < nelt; i++)
18209 {
18210 if (dfirst.perm[i] == 0xff)
18211 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18212 if (dsecond.perm[i] == 0xff)
18213 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18214 }
18215 canonicalize_perm (&dfirst);
18216 start_sequence ();
18217 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18218 seq1 = get_insns ();
18219 end_sequence ();
18220
18221 if (!ok)
18222 return false;
18223
18224 canonicalize_perm (&dsecond);
18225 start_sequence ();
18226 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18227 seq2 = get_insns ();
18228 end_sequence ();
18229
18230 if (!ok)
18231 return false;
18232
18233 if (d->testing_p)
18234 return true;
18235
18236 emit_insn (seq1);
18237 emit_insn (seq2);
18238
18239 dthird = *d;
18240 dthird.op0 = dsecond.target;
18241 dthird.op1 = dsecond.target;
18242 dthird.one_operand_p = true;
18243 dthird.target = gen_reg_rtx (dthird.vmode);
18244 for (i = 0; i < nelt; i++)
18245 dthird.perm[i] = i ^ nelt2;
18246
18247 ok = expand_vec_perm_1 (&dthird);
18248 gcc_assert (ok);
18249
18250 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18251 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18252 return true;
18253 }
18254
18255 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18256 permutation with two pshufb insns and an ior. We should have already
18257 failed all two instruction sequences. */
18258
18259 static bool
18260 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18261 {
18262 rtx rperm[2][16], vperm, l, h, op, m128;
18263 unsigned int i, nelt, eltsz;
18264
18265 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18266 return false;
18267 gcc_assert (!d->one_operand_p);
18268
18269 if (d->testing_p)
18270 return true;
18271
18272 nelt = d->nelt;
18273 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18274
18275 /* Generate two permutation masks. If the required element is within
18276 the given vector it is shuffled into the proper lane. If the required
18277 element is in the other vector, force a zero into the lane by setting
18278 bit 7 in the permutation mask. */
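  /* E.g. for the V16QImode permutation { 0, 17, 2, 19, ... } the first
     mask starts with { 0, -128, 2, -128, ... } and the second mask with
     { -128, 1, -128, 3, ... }; the two pshufb results are then ior'd
     together.  */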
18279 m128 = GEN_INT (-128);
18280 for (i = 0; i < nelt; ++i)
18281 {
18282 unsigned j, e = d->perm[i];
18283 unsigned which = (e >= nelt);
18284 if (e >= nelt)
18285 e -= nelt;
18286
18287 for (j = 0; j < eltsz; ++j)
18288 {
18289 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18290 rperm[1-which][i*eltsz + j] = m128;
18291 }
18292 }
18293
18294 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18295 vperm = force_reg (V16QImode, vperm);
18296
18297 l = gen_reg_rtx (V16QImode);
18298 op = gen_lowpart (V16QImode, d->op0);
18299 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18300
18301 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18302 vperm = force_reg (V16QImode, vperm);
18303
18304 h = gen_reg_rtx (V16QImode);
18305 op = gen_lowpart (V16QImode, d->op1);
18306 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18307
18308 op = d->target;
18309 if (d->vmode != V16QImode)
18310 op = gen_reg_rtx (V16QImode);
18311 emit_insn (gen_iorv16qi3 (op, l, h));
18312 if (op != d->target)
18313 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18314
18315 return true;
18316 }
18317
18318 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
18319 with two vpshufb insns, vpermq and vpor. We should have already failed
18320 all two or three instruction sequences. */
18321
18322 static bool
18323 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18324 {
18325 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18326 unsigned int i, nelt, eltsz;
18327
18328 if (!TARGET_AVX2
18329 || !d->one_operand_p
18330 || (d->vmode != V32QImode && d->vmode != V16HImode))
18331 return false;
18332
18333 if (d->testing_p)
18334 return true;
18335
18336 nelt = d->nelt;
18337 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18338
18339 /* Generate two permutation masks. If the required element is within
18340    the same lane, it is shuffled in. If the required element is from the
18341    other lane, force a zero by setting bit 7 in the permutation mask.
18342    The other mask has non-negative elements where an element is
18343    requested from the other lane, but those elements are also moved to
18344    the other lane, so that the result of vpshufb can have its two
18345    V2TImode halves swapped. */
18346 m128 = GEN_INT (-128);
18347 for (i = 0; i < nelt; ++i)
18348 {
18349 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18350 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18351
18352 for (j = 0; j < eltsz; ++j)
18353 {
18354 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18355 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18356 }
18357 }
18358
18359 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18360 vperm = force_reg (V32QImode, vperm);
18361
18362 h = gen_reg_rtx (V32QImode);
18363 op = gen_lowpart (V32QImode, d->op0);
18364 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18365
18366 /* Swap the 128-bit lanes of h into hp. */
18367 hp = gen_reg_rtx (V4DImode);
18368 op = gen_lowpart (V4DImode, h);
18369 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18370 const1_rtx));
18371
18372 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18373 vperm = force_reg (V32QImode, vperm);
18374
18375 l = gen_reg_rtx (V32QImode);
18376 op = gen_lowpart (V32QImode, d->op0);
18377 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18378
18379 op = d->target;
18380 if (d->vmode != V32QImode)
18381 op = gen_reg_rtx (V32QImode);
18382 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18383 if (op != d->target)
18384 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18385
18386 return true;
18387 }
18388
18389 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18390    and extract-odd permutations of two V32QImode or V16HImode operands
18391 with two vpshufb insns, vpor and vpermq. We should have already
18392 failed all two or three instruction sequences. */
18393
18394 static bool
18395 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18396 {
18397 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18398 unsigned int i, nelt, eltsz;
18399
18400 if (!TARGET_AVX2
18401 || d->one_operand_p
18402 || (d->vmode != V32QImode && d->vmode != V16HImode))
18403 return false;
18404
18405 for (i = 0; i < d->nelt; ++i)
18406 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18407 return false;
18408
18409 if (d->testing_p)
18410 return true;
18411
18412 nelt = d->nelt;
18413 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18414
18415 /* Generate two permutation masks. In the first permutation mask
18416 the first quarter will contain indexes for the first half
18417 of the op0, the second quarter will contain bit 7 set, third quarter
18418 will contain indexes for the second half of the op0 and the
18419 last quarter bit 7 set. In the second permutation mask
18420 the first quarter will contain bit 7 set, the second quarter
18421 indexes for the first half of the op1, the third quarter bit 7 set
18422 and last quarter indexes for the second half of the op1.
18423 I.e. the first mask e.g. for V32QImode extract even will be:
18424 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18425 (all values masked with 0xf except for -128) and second mask
18426 for extract even will be
18427 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18428 m128 = GEN_INT (-128);
18429 for (i = 0; i < nelt; ++i)
18430 {
18431 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18432 unsigned which = d->perm[i] >= nelt;
18433 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18434
18435 for (j = 0; j < eltsz; ++j)
18436 {
18437 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18438 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18439 }
18440 }
18441
18442 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18443 vperm = force_reg (V32QImode, vperm);
18444
18445 l = gen_reg_rtx (V32QImode);
18446 op = gen_lowpart (V32QImode, d->op0);
18447 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18448
18449 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18450 vperm = force_reg (V32QImode, vperm);
18451
18452 h = gen_reg_rtx (V32QImode);
18453 op = gen_lowpart (V32QImode, d->op1);
18454 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18455
18456 ior = gen_reg_rtx (V32QImode);
18457 emit_insn (gen_iorv32qi3 (ior, l, h));
18458
18459 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18460 op = gen_reg_rtx (V4DImode);
18461 ior = gen_lowpart (V4DImode, ior);
18462 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18463 const1_rtx, GEN_INT (3)));
18464 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18465
18466 return true;
18467 }
18468
18469 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18470 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18471 with two "and" and "pack" or two "shift" and "pack" insns. We should
18472 have already failed all two instruction sequences. */
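/* E.g. for two V16QImode operands the even elements are extracted by
   masking each 16-bit element of the V8HImode views with 0x00ff and
   packing with packuswb, while the odd elements are extracted by shifting
   each 16-bit element right by 8 before packing; for the 256-bit modes
   the packed result is additionally permuted with vpermq { 0, 2, 1, 3 }.  */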
18473
18474 static bool
18475 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18476 {
18477 rtx op, dop0, dop1, t;
18478 unsigned i, odd, c, s, nelt = d->nelt;
18479 bool end_perm = false;
18480 machine_mode half_mode;
18481 rtx (*gen_and) (rtx, rtx, rtx);
18482 rtx (*gen_pack) (rtx, rtx, rtx);
18483 rtx (*gen_shift) (rtx, rtx, rtx);
18484
18485 if (d->one_operand_p)
18486 return false;
18487
18488 switch (d->vmode)
18489 {
18490 case E_V8HImode:
18491 /* Required for "pack". */
18492 if (!TARGET_SSE4_1)
18493 return false;
18494 c = 0xffff;
18495 s = 16;
18496 half_mode = V4SImode;
18497 gen_and = gen_andv4si3;
18498 gen_pack = gen_sse4_1_packusdw;
18499 gen_shift = gen_lshrv4si3;
18500 break;
18501 case E_V16QImode:
18502 /* No check as all instructions are SSE2. */
18503 c = 0xff;
18504 s = 8;
18505 half_mode = V8HImode;
18506 gen_and = gen_andv8hi3;
18507 gen_pack = gen_sse2_packuswb;
18508 gen_shift = gen_lshrv8hi3;
18509 break;
18510 case E_V16HImode:
18511 if (!TARGET_AVX2)
18512 return false;
18513 c = 0xffff;
18514 s = 16;
18515 half_mode = V8SImode;
18516 gen_and = gen_andv8si3;
18517 gen_pack = gen_avx2_packusdw;
18518 gen_shift = gen_lshrv8si3;
18519 end_perm = true;
18520 break;
18521 case E_V32QImode:
18522 if (!TARGET_AVX2)
18523 return false;
18524 c = 0xff;
18525 s = 8;
18526 half_mode = V16HImode;
18527 gen_and = gen_andv16hi3;
18528 gen_pack = gen_avx2_packuswb;
18529 gen_shift = gen_lshrv16hi3;
18530 end_perm = true;
18531 break;
18532 default:
18533 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18534 general shuffles. */
18535 return false;
18536 }
18537
18538 /* Check that permutation is even or odd. */
18539 odd = d->perm[0];
18540 if (odd > 1)
18541 return false;
18542
18543 for (i = 1; i < nelt; ++i)
18544 if (d->perm[i] != 2 * i + odd)
18545 return false;
18546
18547 if (d->testing_p)
18548 return true;
18549
18550 dop0 = gen_reg_rtx (half_mode);
18551 dop1 = gen_reg_rtx (half_mode);
18552 if (odd == 0)
18553 {
18554 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18555 t = force_reg (half_mode, t);
18556 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18557 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18558 }
18559 else
18560 {
18561 emit_insn (gen_shift (dop0,
18562 gen_lowpart (half_mode, d->op0),
18563 GEN_INT (s)));
18564 emit_insn (gen_shift (dop1,
18565 gen_lowpart (half_mode, d->op1),
18566 GEN_INT (s)));
18567 }
18568 /* In the AVX2 256-bit case we need to permute the pack result. */
18569 if (TARGET_AVX2 && end_perm)
18570 {
18571 op = gen_reg_rtx (d->vmode);
18572 t = gen_reg_rtx (V4DImode);
18573 emit_insn (gen_pack (op, dop0, dop1));
18574 emit_insn (gen_avx2_permv4di_1 (t,
18575 gen_lowpart (V4DImode, op),
18576 const0_rtx,
18577 const2_rtx,
18578 const1_rtx,
18579 GEN_INT (3)));
18580 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18581 }
18582 else
18583 emit_insn (gen_pack (d->target, dop0, dop1));
18584
18585 return true;
18586 }
18587
18588 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18589 and extract-odd permutations of two V64QI operands
18590 with two "shifts", two "truncs" and one "concat" insns for "odd"
18591 and two "truncs" and one concat insn for "even."
18592 Have already failed all two instruction sequences. */
18593    We should have already failed all two instruction sequences. */
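/* E.g. the even bytes are obtained by truncating each V32HImode element
   of the two operands to QImode and concatenating the two truncated
   vectors, while the odd bytes first shift each 16-bit element right
   by 8 before the truncations.  */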
18594 static bool
18595 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18596 {
18597 rtx t1, t2, t3, t4;
18598 unsigned i, odd, nelt = d->nelt;
18599
18600 if (!TARGET_AVX512BW
18601 || d->one_operand_p
18602 || d->vmode != V64QImode)
18603 return false;
18604
18605 /* Check that permutation is even or odd. */
18606 odd = d->perm[0];
18607 if (odd > 1)
18608 return false;
18609
18610 for (i = 1; i < nelt; ++i)
18611 if (d->perm[i] != 2 * i + odd)
18612 return false;
18613
18614 if (d->testing_p)
18615 return true;
18616
18617
18618 if (odd)
18619 {
18620 t1 = gen_reg_rtx (V32HImode);
18621 t2 = gen_reg_rtx (V32HImode);
18622 emit_insn (gen_lshrv32hi3 (t1,
18623 gen_lowpart (V32HImode, d->op0),
18624 GEN_INT (8)));
18625 emit_insn (gen_lshrv32hi3 (t2,
18626 gen_lowpart (V32HImode, d->op1),
18627 GEN_INT (8)));
18628 }
18629 else
18630 {
18631 t1 = gen_lowpart (V32HImode, d->op0);
18632 t2 = gen_lowpart (V32HImode, d->op1);
18633 }
18634
18635 t3 = gen_reg_rtx (V32QImode);
18636 t4 = gen_reg_rtx (V32QImode);
18637 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18638 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18639 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18640
18641 return true;
18642 }
18643
18644 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
18645 and extract-odd permutations. */
18646
18647 static bool
18648 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18649 {
18650 rtx t1, t2, t3, t4, t5;
18651
18652 switch (d->vmode)
18653 {
18654 case E_V4DFmode:
18655 if (d->testing_p)
18656 break;
18657 t1 = gen_reg_rtx (V4DFmode);
18658 t2 = gen_reg_rtx (V4DFmode);
18659
18660 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18661 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18662 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18663
18664 /* Now an unpck[lh]pd will produce the result required. */
18665 if (odd)
18666 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18667 else
18668 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18669 emit_insn (t3);
18670 break;
18671
18672 case E_V8SFmode:
18673 {
18674 int mask = odd ? 0xdd : 0x88;
18675
18676 if (d->testing_p)
18677 break;
18678 t1 = gen_reg_rtx (V8SFmode);
18679 t2 = gen_reg_rtx (V8SFmode);
18680 t3 = gen_reg_rtx (V8SFmode);
18681
18682 /* Shuffle within the 128-bit lanes to produce:
18683 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18684 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18685 GEN_INT (mask)));
18686
18687 /* Shuffle the lanes around to produce:
18688 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18689 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18690 GEN_INT (0x3)));
18691
18692 /* Shuffle within the 128-bit lanes to produce:
18693 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18694 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18695
18696 /* Shuffle within the 128-bit lanes to produce:
18697 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18698 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18699
18700 /* Shuffle the lanes around to produce:
18701 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18702 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18703 GEN_INT (0x20)));
18704 }
18705 break;
18706
18707 case E_V2DFmode:
18708 case E_V4SFmode:
18709 case E_V2DImode:
18710 case E_V2SImode:
18711 case E_V4SImode:
18712 /* These are always directly implementable by expand_vec_perm_1. */
18713 gcc_unreachable ();
18714
18715 case E_V2SFmode:
18716 gcc_assert (TARGET_MMX_WITH_SSE);
18717 /* We have no suitable instructions. */
18718 if (d->testing_p)
18719 return false;
18720 break;
18721
18722 case E_V4HImode:
18723 if (d->testing_p)
18724 break;
18725 /* We need 2*log2(N)-1 operations to achieve odd/even
18726 with interleave. */
18727 t1 = gen_reg_rtx (V4HImode);
18728 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
18729 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
18730 if (odd)
18731 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
18732 else
18733 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
18734 emit_insn (t2);
18735 break;
18736
18737 case E_V8HImode:
18738 if (TARGET_SSE4_1)
18739 return expand_vec_perm_even_odd_pack (d);
18740 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18741 return expand_vec_perm_pshufb2 (d);
18742 else
18743 {
18744 if (d->testing_p)
18745 break;
18746 /* We need 2*log2(N)-1 operations to achieve odd/even
18747 with interleave. */
18748 t1 = gen_reg_rtx (V8HImode);
18749 t2 = gen_reg_rtx (V8HImode);
18750 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18751 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18752 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18753 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18754 if (odd)
18755 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18756 else
18757 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18758 emit_insn (t3);
18759 }
18760 break;
18761
18762 case E_V16QImode:
18763 return expand_vec_perm_even_odd_pack (d);
18764
18765 case E_V16HImode:
18766 case E_V32QImode:
18767 return expand_vec_perm_even_odd_pack (d);
18768
18769 case E_V64QImode:
18770 return expand_vec_perm_even_odd_trunc (d);
18771
18772 case E_V4DImode:
18773 if (!TARGET_AVX2)
18774 {
18775 struct expand_vec_perm_d d_copy = *d;
18776 d_copy.vmode = V4DFmode;
18777 if (d->testing_p)
18778 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18779 else
18780 d_copy.target = gen_reg_rtx (V4DFmode);
18781 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18782 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18783 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18784 {
18785 if (!d->testing_p)
18786 emit_move_insn (d->target,
18787 gen_lowpart (V4DImode, d_copy.target));
18788 return true;
18789 }
18790 return false;
18791 }
18792
18793 if (d->testing_p)
18794 break;
18795
18796 t1 = gen_reg_rtx (V4DImode);
18797 t2 = gen_reg_rtx (V4DImode);
18798
18799 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18800 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18801 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18802
18803 /* Now a vpunpck[lh]qdq will produce the required result. */
18804 if (odd)
18805 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18806 else
18807 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18808 emit_insn (t3);
18809 break;
18810
18811 case E_V8SImode:
18812 if (!TARGET_AVX2)
18813 {
18814 struct expand_vec_perm_d d_copy = *d;
18815 d_copy.vmode = V8SFmode;
18816 if (d->testing_p)
18817 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18818 else
18819 d_copy.target = gen_reg_rtx (V8SFmode);
18820 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18821 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18822 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18823 {
18824 if (!d->testing_p)
18825 emit_move_insn (d->target,
18826 gen_lowpart (V8SImode, d_copy.target));
18827 return true;
18828 }
18829 return false;
18830 }
18831
18832 if (d->testing_p)
18833 break;
18834
18835 t1 = gen_reg_rtx (V8SImode);
18836 t2 = gen_reg_rtx (V8SImode);
18837 t3 = gen_reg_rtx (V4DImode);
18838 t4 = gen_reg_rtx (V4DImode);
18839 t5 = gen_reg_rtx (V4DImode);
18840
18841 /* Shuffle the lanes around into
18842 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18843 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18844 gen_lowpart (V4DImode, d->op1),
18845 GEN_INT (0x20)));
18846 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18847 gen_lowpart (V4DImode, d->op1),
18848 GEN_INT (0x31)));
18849
18850 /* Swap the 2nd and 3rd position in each lane into
18851 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18852 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18853 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18854 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18855 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18856
18857 /* Now a vpunpck[lh]qdq will produce
18858 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18859 if (odd)
18860 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18861 gen_lowpart (V4DImode, t2));
18862 else
18863 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18864 gen_lowpart (V4DImode, t2));
18865 emit_insn (t3);
18866 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18867 break;
18868
18869 default:
18870 gcc_unreachable ();
18871 }
18872
18873 return true;
18874 }
18875
18876 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18877 extract-even and extract-odd permutations. */
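/* For example, with nelt == 8 an extract-even selector looks like
   { 0, 2, 4, 6, 8, 10, 12, 14 } and an extract-odd selector like
   { 1, 3, 5, 7, 9, 11, 13, 15 }; both match the 2 * i + odd test below.  */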
18878
18879 static bool
18880 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18881 {
18882 unsigned i, odd, nelt = d->nelt;
18883
18884 odd = d->perm[0];
18885 if (odd != 0 && odd != 1)
18886 return false;
18887
18888 for (i = 1; i < nelt; ++i)
18889 if (d->perm[i] != 2 * i + odd)
18890 return false;
18891
18892 return expand_vec_perm_even_odd_1 (d, odd);
18893 }
18894
18895 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
18896 permutations. We assume that expand_vec_perm_1 has already failed. */
18897
18898 static bool
18899 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18900 {
18901 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18902 machine_mode vmode = d->vmode;
18903 unsigned char perm2[4];
18904 rtx op0 = d->op0, dest;
18905 bool ok;
18906
18907 switch (vmode)
18908 {
18909 case E_V4DFmode:
18910 case E_V8SFmode:
18911 /* These are special-cased in sse.md so that we can optionally
18912 use the vbroadcast instruction. They expand to two insns
18913 if the input happens to be in a register. */
18914 gcc_unreachable ();
18915
18916 case E_V2DFmode:
18917 case E_V2SFmode:
18918 case E_V4SFmode:
18919 case E_V2DImode:
18920 case E_V2SImode:
18921 case E_V4SImode:
18922 /* These are always implementable using standard shuffle patterns. */
18923 gcc_unreachable ();
18924
18925 case E_V8HImode:
18926 case E_V16QImode:
18927 /* These can be implemented via interleave. We save one insn by
18928 stopping once we have promoted to V4SImode and then using pshufd. */
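/* For example, broadcasting element 5 of a V8HImode vector: nelt2 starts
   at 4, so the high interleave is used and elt becomes 1; the interleave
   yields { 4 4 5 5 6 6 7 7 }, and the final pshufd of dword 1 of the
   V4SImode view then replicates the original element 5 everywhere.  */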
18929 if (d->testing_p)
18930 return true;
18931 do
18932 {
18933 rtx dest;
18934 rtx (*gen) (rtx, rtx, rtx)
18935 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18936 : gen_vec_interleave_lowv8hi;
18937
18938 if (elt >= nelt2)
18939 {
18940 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18941 : gen_vec_interleave_highv8hi;
18942 elt -= nelt2;
18943 }
18944 nelt2 /= 2;
18945
18946 dest = gen_reg_rtx (vmode);
18947 emit_insn (gen (dest, op0, op0));
18948 vmode = get_mode_wider_vector (vmode);
18949 op0 = gen_lowpart (vmode, dest);
18950 }
18951 while (vmode != V4SImode);
18952
18953 memset (perm2, elt, 4);
18954 dest = gen_reg_rtx (V4SImode);
18955 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18956 gcc_assert (ok);
18957 if (!d->testing_p)
18958 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18959 return true;
18960
18961 case E_V64QImode:
18962 case E_V32QImode:
18963 case E_V16HImode:
18964 case E_V8SImode:
18965 case E_V4DImode:
18966 /* For AVX2, broadcasts of the first element should already have been
18967 handled by expand_vec_perm_1 using vpbroadcast* or vpermq. */
18968 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18969 return false;
18970
18971 default:
18972 gcc_unreachable ();
18973 }
18974 }
18975
18976 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18977 broadcast permutations. */
18978
18979 static bool
18980 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18981 {
18982 unsigned i, elt, nelt = d->nelt;
18983
18984 if (!d->one_operand_p)
18985 return false;
18986
18987 elt = d->perm[0];
18988 for (i = 1; i < nelt; ++i)
18989 if (d->perm[i] != elt)
18990 return false;
18991
18992 return expand_vec_perm_broadcast_1 (d);
18993 }
18994
18995 /* Implement arbitrary permutations of two V64QImode operands
18996 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18997 static bool
18998 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18999 {
19000 if (!TARGET_AVX512BW || d->vmode != V64QImode)
19001 return false;
19002
19003 if (d->testing_p)
19004 return true;
19005
19006 struct expand_vec_perm_d ds[2];
19007 rtx rperm[128], vperm, target0, target1;
19008 unsigned int i, nelt;
19009 machine_mode vmode;
19010
19011 nelt = d->nelt;
19012 vmode = V64QImode;
19013
19014 for (i = 0; i < 2; i++)
19015 {
19016 ds[i] = *d;
19017 ds[i].vmode = V32HImode;
19018 ds[i].nelt = 32;
19019 ds[i].target = gen_reg_rtx (V32HImode);
19020 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19021 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19022 }
19023
19024 /* Prepare permutations such that the first one takes care of
19025 putting the even bytes into the right positions or one position
19026 higher (ds[0]), and the second one takes care of putting the
19027 odd bytes into the right positions or one position lower
19028 (ds[1]). */
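/* For example, if d->perm[10] == 37 (an odd source byte at an even
   destination position), then ds[0].perm[5] == 18, so the word shuffle
   places source bytes 36/37 at byte positions 10/11 of ds[0].target;
   rperm[10] == 11 then makes the vpshufb copy byte 11 (source byte 37)
   into position 10, while rperm[10 + 64] stays -1 for the other half.  */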
19029
19030 for (i = 0; i < nelt; i++)
19031 {
19032 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19033 if (i & 1)
19034 {
19035 rperm[i] = constm1_rtx;
19036 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19037 }
19038 else
19039 {
19040 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19041 rperm[i + 64] = constm1_rtx;
19042 }
19043 }
19044
19045 bool ok = expand_vec_perm_1 (&ds[0]);
19046 gcc_assert (ok);
19047 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19048
19049 ok = expand_vec_perm_1 (&ds[1]);
19050 gcc_assert (ok);
19051 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19052
19053 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19054 vperm = force_reg (vmode, vperm);
19055 target0 = gen_reg_rtx (V64QImode);
19056 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19057
19058 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19059 vperm = force_reg (vmode, vperm);
19060 target1 = gen_reg_rtx (V64QImode);
19061 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19062
19063 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19064 return true;
19065 }
19066
19067 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
19068 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19069 all the shorter instruction sequences. */
19070
19071 static bool
19072 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19073 {
19074 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19075 unsigned int i, nelt, eltsz;
19076 bool used[4];
19077
19078 if (!TARGET_AVX2
19079 || d->one_operand_p
19080 || (d->vmode != V32QImode && d->vmode != V16HImode))
19081 return false;
19082
19083 if (d->testing_p)
19084 return true;
19085
19086 nelt = d->nelt;
19087 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19088
19089 /* Generate 4 permutation masks. If the required element is within
19090 the same lane, it is shuffled in directly. If the required element
19091 comes from the other lane, force a zero by setting bit 7 in the
19092 permutation mask. The other mask has a non-negative element where
19093 an element is requested from the other lane, but that element is
19094 also moved to the other lane, so that the two V2TImode halves of
19095 the vpshufb result can be swapped afterwards. */
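/* For example, with V32QImode and d->perm[3] == 20: e == 4, xlane == 16
   and which == 1, so rperm[1][3 ^ 16] == rperm[1][19] selects byte 4 of
   the high lane (absolute byte 20 of d->op0); the later lane swap then
   moves that byte to position 3.  */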
19096 m128 = GEN_INT (-128);
19097 for (i = 0; i < 32; ++i)
19098 {
19099 rperm[0][i] = m128;
19100 rperm[1][i] = m128;
19101 rperm[2][i] = m128;
19102 rperm[3][i] = m128;
19103 }
19104 used[0] = false;
19105 used[1] = false;
19106 used[2] = false;
19107 used[3] = false;
19108 for (i = 0; i < nelt; ++i)
19109 {
19110 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19111 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19112 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19113
19114 for (j = 0; j < eltsz; ++j)
19115 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19116 used[which] = true;
19117 }
19118
19119 for (i = 0; i < 2; ++i)
19120 {
19121 if (!used[2 * i + 1])
19122 {
19123 h[i] = NULL_RTX;
19124 continue;
19125 }
19126 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19127 gen_rtvec_v (32, rperm[2 * i + 1]));
19128 vperm = force_reg (V32QImode, vperm);
19129 h[i] = gen_reg_rtx (V32QImode);
19130 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19131 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19132 }
19133
19134 /* Swap the 128-bit lanes of h[X]. */
19135 for (i = 0; i < 2; ++i)
19136 {
19137 if (h[i] == NULL_RTX)
19138 continue;
19139 op = gen_reg_rtx (V4DImode);
19140 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19141 const2_rtx, GEN_INT (3), const0_rtx,
19142 const1_rtx));
19143 h[i] = gen_lowpart (V32QImode, op);
19144 }
19145
19146 for (i = 0; i < 2; ++i)
19147 {
19148 if (!used[2 * i])
19149 {
19150 l[i] = NULL_RTX;
19151 continue;
19152 }
19153 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19154 vperm = force_reg (V32QImode, vperm);
19155 l[i] = gen_reg_rtx (V32QImode);
19156 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19157 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19158 }
19159
19160 for (i = 0; i < 2; ++i)
19161 {
19162 if (h[i] && l[i])
19163 {
19164 op = gen_reg_rtx (V32QImode);
19165 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19166 l[i] = op;
19167 }
19168 else if (h[i])
19169 l[i] = h[i];
19170 }
19171
19172 gcc_assert (l[0] && l[1]);
19173 op = d->target;
19174 if (d->vmode != V32QImode)
19175 op = gen_reg_rtx (V32QImode);
19176 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19177 if (op != d->target)
19178 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19179 return true;
19180 }
19181
19182 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19183 taken care of, perform the expansion in D and return true on success. */
19184
19185 static bool
19186 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19187 {
19188 /* Try a single instruction expansion. */
19189 if (expand_vec_perm_1 (d))
19190 return true;
19191
19192 /* Try sequences of two instructions. */
19193
19194 if (expand_vec_perm_pshuflw_pshufhw (d))
19195 return true;
19196
19197 if (expand_vec_perm_palignr (d, false))
19198 return true;
19199
19200 if (expand_vec_perm_interleave2 (d))
19201 return true;
19202
19203 if (expand_vec_perm_broadcast (d))
19204 return true;
19205
19206 if (expand_vec_perm_vpermq_perm_1 (d))
19207 return true;
19208
19209 if (expand_vec_perm_vperm2f128 (d))
19210 return true;
19211
19212 if (expand_vec_perm_pblendv (d))
19213 return true;
19214
19215 /* Try sequences of three instructions. */
19216
19217 if (expand_vec_perm_even_odd_pack (d))
19218 return true;
19219
19220 if (expand_vec_perm_2vperm2f128_vshuf (d))
19221 return true;
19222
19223 if (expand_vec_perm_pshufb2 (d))
19224 return true;
19225
19226 if (expand_vec_perm_interleave3 (d))
19227 return true;
19228
19229 if (expand_vec_perm_vperm2f128_vblend (d))
19230 return true;
19231
19232 /* Try sequences of four instructions. */
19233
19234 if (expand_vec_perm_even_odd_trunc (d))
19235 return true;
19236 if (expand_vec_perm_vpshufb2_vpermq (d))
19237 return true;
19238
19239 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19240 return true;
19241
19242 if (expand_vec_perm_vpermt2_vpshub2 (d))
19243 return true;
19244
19245 /* ??? Look for narrow permutations whose element orderings would
19246 allow the promotion to a wider mode. */
19247
19248 /* ??? Look for sequences of interleave or a wider permute that place
19249 the data into the correct lanes for a half-vector shuffle like
19250 pshuf[lh]w or vpermilps. */
19251
19252 /* ??? Look for sequences of interleave that produce the desired results.
19253 The combinatorics of punpck[lh] get pretty ugly... */
19254
19255 if (expand_vec_perm_even_odd (d))
19256 return true;
19257
19258 /* Even longer sequences. */
19259 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19260 return true;
19261
19262 /* See if we can get the same permutation in different vector integer
19263 mode. */
19264 struct expand_vec_perm_d nd;
19265 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19266 {
19267 if (!d->testing_p)
19268 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19269 return true;
19270 }
19271
19272 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19273 if (expand_vec_perm2_vperm2f128_vblend (d))
19274 return true;
19275
19276 return false;
19277 }
19278
19279 /* If a permutation only uses one operand, make it clear. Returns true
19280 if the permutation references both operands. */
19281
19282 static bool
19283 canonicalize_perm (struct expand_vec_perm_d *d)
19284 {
19285 int i, which, nelt = d->nelt;
19286
19287 for (i = which = 0; i < nelt; ++i)
19288 which |= (d->perm[i] < nelt ? 1 : 2);
19289
19290 d->one_operand_p = true;
19291 switch (which)
19292 {
19293 default:
19294 gcc_unreachable();
19295
19296 case 3:
19297 if (!rtx_equal_p (d->op0, d->op1))
19298 {
19299 d->one_operand_p = false;
19300 break;
19301 }
19302 /* The elements of PERM do not suggest that only the first operand
19303 is used, but both operands are identical. Allow easier matching
19304 of the permutation by folding the permutation into the single
19305 input vector. */
19306 /* FALLTHRU */
19307
19308 case 2:
19309 for (i = 0; i < nelt; ++i)
19310 d->perm[i] &= nelt - 1;
19311 d->op0 = d->op1;
19312 break;
19313
19314 case 1:
19315 d->op1 = d->op0;
19316 break;
19317 }
19318
19319 return (which == 3);
19320 }
19321
19322 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19323
19324 bool
19325 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19326 rtx op1, const vec_perm_indices &sel)
19327 {
19328 struct expand_vec_perm_d d;
19329 unsigned char perm[MAX_VECT_LEN];
19330 unsigned int i, nelt, which;
19331 bool two_args;
19332
19333 d.target = target;
19334 d.op0 = op0;
19335 d.op1 = op1;
19336
19337 d.vmode = vmode;
19338 gcc_assert (VECTOR_MODE_P (d.vmode));
19339 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19340 d.testing_p = !target;
19341
19342 gcc_assert (sel.length () == nelt);
19343 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19344
19345 /* Given sufficient ISA support we can just return true here
19346 for selected vector modes. */
19347 switch (d.vmode)
19348 {
19349 case E_V16SFmode:
19350 case E_V16SImode:
19351 case E_V8DImode:
19352 case E_V8DFmode:
19353 if (!TARGET_AVX512F)
19354 return false;
19355 /* All implementable with a single vperm[it]2 insn. */
19356 if (d.testing_p)
19357 return true;
19358 break;
19359 case E_V32HImode:
19360 if (!TARGET_AVX512BW)
19361 return false;
19362 if (d.testing_p)
19363 /* All implementable with a single vperm[it]2 insn. */
19364 return true;
19365 break;
19366 case E_V64QImode:
19367 if (!TARGET_AVX512BW)
19368 return false;
19369 if (d.testing_p)
19370 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19371 return true;
19372 break;
19373 case E_V8SImode:
19374 case E_V8SFmode:
19375 case E_V4DFmode:
19376 case E_V4DImode:
19377 if (!TARGET_AVX)
19378 return false;
19379 if (d.testing_p && TARGET_AVX512VL)
19380 /* All implementable with a single vperm[it]2 insn. */
19381 return true;
19382 break;
19383 case E_V16HImode:
19384 if (!TARGET_SSE2)
19385 return false;
19386 if (d.testing_p && TARGET_AVX2)
19387 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19388 return true;
19389 break;
19390 case E_V32QImode:
19391 if (!TARGET_SSE2)
19392 return false;
19393 if (d.testing_p && TARGET_AVX2)
19394 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19395 return true;
19396 break;
19397 case E_V8HImode:
19398 case E_V16QImode:
19399 if (!TARGET_SSE2)
19400 return false;
19401 /* Fall through. */
19402 case E_V4SImode:
19403 case E_V4SFmode:
19404 if (!TARGET_SSE)
19405 return false;
19406 /* All implementable with a single vpperm insn. */
19407 if (d.testing_p && TARGET_XOP)
19408 return true;
19409 /* All implementable with 2 pshufb + 1 ior. */
19410 if (d.testing_p && TARGET_SSSE3)
19411 return true;
19412 break;
19413 case E_V2SFmode:
19414 case E_V2SImode:
19415 case E_V4HImode:
19416 if (!TARGET_MMX_WITH_SSE)
19417 return false;
19418 break;
19419 case E_V2DImode:
19420 case E_V2DFmode:
19421 if (!TARGET_SSE)
19422 return false;
19423 /* All implementable with shufpd or unpck[lh]pd. */
19424 if (d.testing_p)
19425 return true;
19426 break;
19427 default:
19428 return false;
19429 }
19430
19431 for (i = which = 0; i < nelt; ++i)
19432 {
19433 unsigned char e = sel[i];
19434 gcc_assert (e < 2 * nelt);
19435 d.perm[i] = e;
19436 perm[i] = e;
19437 which |= (e < nelt ? 1 : 2);
19438 }
19439
19440 if (d.testing_p)
19441 {
19442 /* If all elements are from the second vector, fold them into the first. */
19443 if (which == 2)
19444 for (i = 0; i < nelt; ++i)
19445 d.perm[i] -= nelt;
19446
19447 /* Check whether the mask can be applied to the vector type. */
19448 d.one_operand_p = (which != 3);
19449
19450 /* Implementable with shufps or pshufd. */
19451 if (d.one_operand_p
19452 && (d.vmode == V4SFmode || d.vmode == V2SFmode
19453 || d.vmode == V4SImode || d.vmode == V2SImode))
19454 return true;
19455
19456 /* Otherwise we have to go through the motions and see if we can
19457 figure out how to generate the requested permutation. */
19458 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19459 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19460 if (!d.one_operand_p)
19461 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19462
19463 start_sequence ();
19464 bool ret = ix86_expand_vec_perm_const_1 (&d);
19465 end_sequence ();
19466
19467 return ret;
19468 }
19469
19470 two_args = canonicalize_perm (&d);
19471
19472 if (ix86_expand_vec_perm_const_1 (&d))
19473 return true;
19474
19475 /* If the selector says both arguments are needed, but the operands are the
19476 same, the above tried to expand with one_operand_p set and a flattened
19477 selector. If that didn't work, retry with one_operand_p clear, which is
19478 what we succeeded with during the testing pass. */
19479 if (two_args && d.one_operand_p)
19480 {
19481 d.one_operand_p = false;
19482 memcpy (d.perm, perm, sizeof (perm));
19483 return ix86_expand_vec_perm_const_1 (&d);
19484 }
19485
19486 return false;
19487 }
19488
19489 void
19490 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19491 {
19492 struct expand_vec_perm_d d;
19493 unsigned i, nelt;
19494
19495 d.target = targ;
19496 d.op0 = op0;
19497 d.op1 = op1;
19498 d.vmode = GET_MODE (targ);
19499 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19500 d.one_operand_p = false;
19501 d.testing_p = false;
19502
19503 for (i = 0; i < nelt; ++i)
19504 d.perm[i] = i * 2 + odd;
19505
19506 /* We'll either be able to implement the permutation directly... */
19507 if (expand_vec_perm_1 (&d))
19508 return;
19509
19510 /* ... or we use the special-case patterns. */
19511 expand_vec_perm_even_odd_1 (&d, odd);
19512 }
19513
19514 static void
19515 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19516 {
19517 struct expand_vec_perm_d d;
19518 unsigned i, nelt, base;
19519 bool ok;
19520
19521 d.target = targ;
19522 d.op0 = op0;
19523 d.op1 = op1;
19524 d.vmode = GET_MODE (targ);
19525 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19526 d.one_operand_p = false;
19527 d.testing_p = false;
19528
19529 base = high_p ? nelt / 2 : 0;
19530 for (i = 0; i < nelt / 2; ++i)
19531 {
19532 d.perm[i * 2] = i + base;
19533 d.perm[i * 2 + 1] = i + base + nelt;
19534 }
19535
19536 /* Note that for AVX this isn't one instruction. */
19537 ok = ix86_expand_vec_perm_const_1 (&d);
19538 gcc_assert (ok);
19539 }
19540
19541 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
19542 under TARGET_AVX512BW. E.g. for v16qi a * b, it generates
19543
19544 vpmovzxbw ymm2, xmm0
19545 vpmovzxbw ymm3, xmm1
19546 vpmullw ymm4, ymm2, ymm3
19547 vpmovwb xmm0, ymm4
19548
19549 which takes fewer instructions than ix86_expand_vecop_qihi.
19550 Return true on success. */
19551
19552 bool
19553 ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
19554 {
19555 machine_mode himode, qimode = GET_MODE (dest);
19556 rtx hop1, hop2, hdest;
19557 rtx (*gen_extend)(rtx, rtx);
19558 rtx (*gen_truncate)(rtx, rtx);
19559
19560 /* There's no V64HImode multiplication instruction. */
19561 if (qimode == E_V64QImode)
19562 return false;
19563
19564 /* vpmovwb is only available under AVX512BW. */
19565 if (!TARGET_AVX512BW)
19566 return false;
19567 if ((qimode == V8QImode || qimode == V16QImode)
19568 && !TARGET_AVX512VL)
19569 return false;
19570 /* Do not generate zmm instructions when 128/256-bit vector width is preferred. */
19571 if (qimode == V32QImode
19572 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
19573 return false;
19574
19575 switch (qimode)
19576 {
19577 case E_V8QImode:
19578 himode = V8HImode;
19579 gen_extend = gen_zero_extendv8qiv8hi2;
19580 gen_truncate = gen_truncv8hiv8qi2;
19581 break;
19582 case E_V16QImode:
19583 himode = V16HImode;
19584 gen_extend = gen_zero_extendv16qiv16hi2;
19585 gen_truncate = gen_truncv16hiv16qi2;
19586 break;
19587 case E_V32QImode:
19588 himode = V32HImode;
19589 gen_extend = gen_zero_extendv32qiv32hi2;
19590 gen_truncate = gen_truncv32hiv32qi2;
19591 break;
19592 default:
19593 gcc_unreachable ();
19594 }
19595
19596 hop1 = gen_reg_rtx (himode);
19597 hop2 = gen_reg_rtx (himode);
19598 hdest = gen_reg_rtx (himode);
19599 emit_insn (gen_extend (hop1, op1));
19600 emit_insn (gen_extend (hop2, op2));
19601 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
19602 hop1, hop2)));
19603 emit_insn (gen_truncate (dest, hdest));
19604 return true;
19605 }
19606
19607 /* Expand a vector shift by a constant for V*QImode in terms of the
19608 same operation on V*HImode. Return true on success. */
19609 bool
19610 ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19611 {
19612 machine_mode qimode, himode;
19613 HOST_WIDE_INT and_constant, xor_constant;
19614 HOST_WIDE_INT shift_amount;
19615 rtx vec_const_and, vec_const_xor;
19616 rtx tmp, op1_subreg;
19617 rtx (*gen_shift) (rtx, rtx, rtx);
19618 rtx (*gen_and) (rtx, rtx, rtx);
19619 rtx (*gen_xor) (rtx, rtx, rtx);
19620 rtx (*gen_sub) (rtx, rtx, rtx);
19621
19622 /* Only optimize shift by constant. */
19623 if (!CONST_INT_P (op2))
19624 return false;
19625
19626 qimode = GET_MODE (dest);
19627 shift_amount = INTVAL (op2);
19628 /* Do nothing when the shift amount is greater than or equal to 8. */
19629 if (shift_amount > 7)
19630 return false;
19631
19632 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
19633 /* Record the sign bit position of the shifted byte (used for ASHIFTRT). */
19634 xor_constant = 1 << (8 - shift_amount - 1);
19635
19636 /* Mask zeroing the bits shifted in from the neighboring byte element. */
19637 and_constant
19638 = (code == ASHIFT ? 256 - (1 << shift_amount)
19639 : (1 << (8 - shift_amount)) - 1);
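/* Worked example for ASHIFTRT by 2 on the byte 0x90 (-112): the HImode
   shift plus the and_constant mask leave 0x24 (the logical shift of the
   byte), xoring with xor_constant 0x20 gives 0x04, and subtracting 0x20
   gives 0xe4 == -28, the arithmetic shift result.  */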
19640
19641 switch (qimode)
19642 {
19643 case E_V16QImode:
19644 himode = V8HImode;
19645 gen_shift =
19646 ((code == ASHIFT)
19647 ? gen_ashlv8hi3
19648 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
19649 gen_and = gen_andv16qi3;
19650 gen_xor = gen_xorv16qi3;
19651 gen_sub = gen_subv16qi3;
19652 break;
19653 case E_V32QImode:
19654 himode = V16HImode;
19655 gen_shift =
19656 ((code == ASHIFT)
19657 ? gen_ashlv16hi3
19658 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
19659 gen_and = gen_andv32qi3;
19660 gen_xor = gen_xorv32qi3;
19661 gen_sub = gen_subv32qi3;
19662 break;
19663 case E_V64QImode:
19664 himode = V32HImode;
19665 gen_shift =
19666 ((code == ASHIFT)
19667 ? gen_ashlv32hi3
19668 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
19669 gen_and = gen_andv64qi3;
19670 gen_xor = gen_xorv64qi3;
19671 gen_sub = gen_subv64qi3;
19672 break;
19673 default:
19674 gcc_unreachable ();
19675 }
19676
19677 tmp = gen_reg_rtx (himode);
19678 vec_const_and = gen_reg_rtx (qimode);
19679 op1_subreg = lowpart_subreg (himode, op1, qimode);
19680
19681 /* For every shift code, perform the shift on the word-sized view and
19682 mask out the bits that crossed a byte boundary:
19683 vpsllw/vpsrlw/vpsraw $shift_amount, %op1, %dest; vpand %vec_const_and, %dest. */
19684 emit_insn (gen_shift (tmp, op1_subreg, op2));
19685 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
19686 emit_move_insn (vec_const_and,
19687 ix86_build_const_vector (qimode, true,
19688 gen_int_mode (and_constant, QImode)));
19689 emit_insn (gen_and (dest, dest, vec_const_and));
19690
19691 /* For ASHIFTRT, sign-extend the shifted bytes with extra operations:
19692 vpxor %vec_const_xor, %dest, %dest
19693 vpsubb %vec_const_xor, %dest, %dest. */
19694 if (code == ASHIFTRT)
19695 {
19696 vec_const_xor = gen_reg_rtx (qimode);
19697 emit_move_insn (vec_const_xor,
19698 ix86_build_const_vector (qimode, true,
19699 gen_int_mode (xor_constant, QImode)));
19700 emit_insn (gen_xor (dest, dest, vec_const_xor));
19701 emit_insn (gen_sub (dest, dest, vec_const_xor));
19702 }
19703 return true;
19704 }
19705
19706 /* Expand a vector operation CODE for a V*QImode in terms of the
19707 same operation on V*HImode. */
19708
19709 void
19710 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19711 {
19712 machine_mode qimode = GET_MODE (dest);
19713 machine_mode himode;
19714 rtx (*gen_il) (rtx, rtx, rtx);
19715 rtx (*gen_ih) (rtx, rtx, rtx);
19716 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19717 struct expand_vec_perm_d d;
19718 bool ok, full_interleave;
19719 bool uns_p = false;
19720 int i;
19721
19722 switch (qimode)
19723 {
19724 case E_V16QImode:
19725 himode = V8HImode;
19726 gen_il = gen_vec_interleave_lowv16qi;
19727 gen_ih = gen_vec_interleave_highv16qi;
19728 break;
19729 case E_V32QImode:
19730 himode = V16HImode;
19731 gen_il = gen_avx2_interleave_lowv32qi;
19732 gen_ih = gen_avx2_interleave_highv32qi;
19733 break;
19734 case E_V64QImode:
19735 himode = V32HImode;
19736 gen_il = gen_avx512bw_interleave_lowv64qi;
19737 gen_ih = gen_avx512bw_interleave_highv64qi;
19738 break;
19739 default:
19740 gcc_unreachable ();
19741 }
19742
19743 op2_l = op2_h = op2;
19744 switch (code)
19745 {
19746 case MULT:
19747 /* Unpack data such that we've got a source byte in each low byte of
19748 each word. We don't care what goes into the high byte of each word.
19749 Rather than trying to get zero in there, most convenient is to let
19750 it be a copy of the low byte. */
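/* (This is safe because the low byte of each HImode product depends only
   on the low bytes of its operands, and only those even/low bytes survive
   the final permutation below.)  */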
19751 op2_l = gen_reg_rtx (qimode);
19752 op2_h = gen_reg_rtx (qimode);
19753 emit_insn (gen_il (op2_l, op2, op2));
19754 emit_insn (gen_ih (op2_h, op2, op2));
19755
19756 op1_l = gen_reg_rtx (qimode);
19757 op1_h = gen_reg_rtx (qimode);
19758 emit_insn (gen_il (op1_l, op1, op1));
19759 emit_insn (gen_ih (op1_h, op1, op1));
19760 full_interleave = qimode == V16QImode;
19761 break;
19762
19763 case ASHIFT:
19764 case LSHIFTRT:
19765 uns_p = true;
19766 /* FALLTHRU */
19767 case ASHIFTRT:
19768 op1_l = gen_reg_rtx (himode);
19769 op1_h = gen_reg_rtx (himode);
19770 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19771 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19772 full_interleave = true;
19773 break;
19774 default:
19775 gcc_unreachable ();
19776 }
19777
19778 /* Perform the operation. */
19779 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19780 1, OPTAB_DIRECT);
19781 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19782 1, OPTAB_DIRECT);
19783 gcc_assert (res_l && res_h);
19784
19785 /* Merge the data back into the right place. */
19786 d.target = dest;
19787 d.op0 = gen_lowpart (qimode, res_l);
19788 d.op1 = gen_lowpart (qimode, res_h);
19789 d.vmode = qimode;
19790 d.nelt = GET_MODE_NUNITS (qimode);
19791 d.one_operand_p = false;
19792 d.testing_p = false;
19793
19794 if (full_interleave)
19795 {
19796 /* For SSE2, we used a full interleave, so the desired
19797 results are in the even elements. */
19798 for (i = 0; i < d.nelt; ++i)
19799 d.perm[i] = i * 2;
19800 }
19801 else
19802 {
19803 /* For AVX, the interleave used above was not cross-lane. So the
19804 extraction is of the even elements, but with the second and third
19805 quarters swapped. Happily, that is even one insn shorter than a
19806 plain even extraction. For AVX512BW we have 4 lanes; we extract
19807 the evens from within a lane, always first from the first and then
19808 from the second source operand, and the index bits above the low 4 bits remain the same.
19809 Thus, for d.nelt == 32 we want permutation
19810 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19811 and for d.nelt == 64 we want permutation
19812 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19813 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
19814 for (i = 0; i < d.nelt; ++i)
19815 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19816 }
19817
19818 ok = ix86_expand_vec_perm_const_1 (&d);
19819 gcc_assert (ok);
19820
19821 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19822 gen_rtx_fmt_ee (code, qimode, op1, op2));
19823 }
19824
19825 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
19826 if op is CONST_VECTOR with all odd elements equal to their
19827 preceding element. */
19828
19829 static bool
19830 const_vector_equal_evenodd_p (rtx op)
19831 {
19832 machine_mode mode = GET_MODE (op);
19833 int i, nunits = GET_MODE_NUNITS (mode);
19834 if (GET_CODE (op) != CONST_VECTOR
19835 || nunits != CONST_VECTOR_NUNITS (op))
19836 return false;
19837 for (i = 0; i < nunits; i += 2)
19838 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19839 return false;
19840 return true;
19841 }
19842
19843 void
19844 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19845 bool uns_p, bool odd_p)
19846 {
19847 machine_mode mode = GET_MODE (op1);
19848 machine_mode wmode = GET_MODE (dest);
19849 rtx x;
19850 rtx orig_op1 = op1, orig_op2 = op2;
19851
19852 if (!nonimmediate_operand (op1, mode))
19853 op1 = force_reg (mode, op1);
19854 if (!nonimmediate_operand (op2, mode))
19855 op2 = force_reg (mode, op2);
19856
19857 /* We only play even/odd games with vectors of SImode. */
19858 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19859
19860 /* If we're looking for the odd results, shift those members down to
19861 the even slots. For some cpus this is faster than a PSHUFD. */
19862 if (odd_p)
19863 {
19864 /* For XOP use vpmacsdqh, but only for smult, as it is only
19865 signed. */
19866 if (TARGET_XOP && mode == V4SImode && !uns_p)
19867 {
19868 x = force_reg (wmode, CONST0_RTX (wmode));
19869 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19870 return;
19871 }
19872
19873 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19874 if (!const_vector_equal_evenodd_p (orig_op1))
19875 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19876 x, NULL, 1, OPTAB_DIRECT);
19877 if (!const_vector_equal_evenodd_p (orig_op2))
19878 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19879 x, NULL, 1, OPTAB_DIRECT);
19880 op1 = gen_lowpart (mode, op1);
19881 op2 = gen_lowpart (mode, op2);
19882 }
19883
19884 if (mode == V16SImode)
19885 {
19886 if (uns_p)
19887 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19888 else
19889 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19890 }
19891 else if (mode == V8SImode)
19892 {
19893 if (uns_p)
19894 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19895 else
19896 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19897 }
19898 else if (uns_p)
19899 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19900 else if (TARGET_SSE4_1)
19901 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19902 else
19903 {
19904 rtx s1, s2, t0, t1, t2;
19905
19906 /* The easiest way to implement this without PMULDQ is to go through
19907 the motions as if we were performing a full 64-bit multiply, except
19908 that we need to do less shuffling of the elements. */
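/* In effect this uses the identity, modulo 2^64:
     (int64) A * (int64) B
       == (uint64) A * (uint64) B + ((SA * B + SB * A) << 32),
   where SA/SB are the all-ones masks computed below for negative A/B,
   since (0xffffffff * X) << 32 == -(X << 32) modulo 2^64.  */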
19909
19910 /* Compute the sign-extension, aka highparts, of the two operands. */
19911 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19912 op1, pc_rtx, pc_rtx);
19913 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19914 op2, pc_rtx, pc_rtx);
19915
19916 /* Multiply LO(A) * HI(B), and vice-versa. */
19917 t1 = gen_reg_rtx (wmode);
19918 t2 = gen_reg_rtx (wmode);
19919 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19920 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19921
19922 /* Multiply LO(A) * LO(B). */
19923 t0 = gen_reg_rtx (wmode);
19924 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19925
19926 /* Combine and shift the highparts into place. */
19927 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19928 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19929 1, OPTAB_DIRECT);
19930
19931 /* Combine high and low parts. */
19932 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19933 return;
19934 }
19935 emit_insn (x);
19936 }
19937
19938 void
19939 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19940 bool uns_p, bool high_p)
19941 {
19942 machine_mode wmode = GET_MODE (dest);
19943 machine_mode mode = GET_MODE (op1);
19944 rtx t1, t2, t3, t4, mask;
19945
19946 switch (mode)
19947 {
19948 case E_V4SImode:
19949 t1 = gen_reg_rtx (mode);
19950 t2 = gen_reg_rtx (mode);
19951 if (TARGET_XOP && !uns_p)
19952 {
19953 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19954 shuffle the elements once so that all elements are in the right
19955 place for immediate use: { A C B D }. */
19956 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19957 const1_rtx, GEN_INT (3)));
19958 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19959 const1_rtx, GEN_INT (3)));
19960 }
19961 else
19962 {
19963 /* Put the elements into place for the multiply. */
19964 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19965 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19966 high_p = false;
19967 }
19968 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19969 break;
19970
19971 case E_V8SImode:
19972 /* Shuffle the elements between the lanes. After this we
19973 have { A B E F | C D G H } for each operand. */
19974 t1 = gen_reg_rtx (V4DImode);
19975 t2 = gen_reg_rtx (V4DImode);
19976 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19977 const0_rtx, const2_rtx,
19978 const1_rtx, GEN_INT (3)));
19979 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19980 const0_rtx, const2_rtx,
19981 const1_rtx, GEN_INT (3)));
19982
19983 /* Shuffle the elements within the lanes. After this we
19984 have { A A B B | C C D D } or { E E F F | G G H H }. */
19985 t3 = gen_reg_rtx (V8SImode);
19986 t4 = gen_reg_rtx (V8SImode);
19987 mask = GEN_INT (high_p
19988 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19989 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19990 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19991 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19992
19993 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19994 break;
19995
19996 case E_V8HImode:
19997 case E_V16HImode:
19998 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19999 uns_p, OPTAB_DIRECT);
20000 t2 = expand_binop (mode,
20001 uns_p ? umul_highpart_optab : smul_highpart_optab,
20002 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
20003 gcc_assert (t1 && t2);
20004
20005 t3 = gen_reg_rtx (mode);
20006 ix86_expand_vec_interleave (t3, t1, t2, high_p);
20007 emit_move_insn (dest, gen_lowpart (wmode, t3));
20008 break;
20009
20010 case E_V16QImode:
20011 case E_V32QImode:
20012 case E_V32HImode:
20013 case E_V16SImode:
20014 case E_V64QImode:
20015 t1 = gen_reg_rtx (wmode);
20016 t2 = gen_reg_rtx (wmode);
20017 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
20018 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
20019
20020 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
20021 break;
20022
20023 default:
20024 gcc_unreachable ();
20025 }
20026 }
20027
20028 void
20029 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
20030 {
20031 rtx res_1, res_2, res_3, res_4;
20032
20033 res_1 = gen_reg_rtx (V4SImode);
20034 res_2 = gen_reg_rtx (V4SImode);
20035 res_3 = gen_reg_rtx (V2DImode);
20036 res_4 = gen_reg_rtx (V2DImode);
20037 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
20038 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
20039
20040 /* Move the results in element 2 down to element 1; we don't care
20041 what goes in elements 2 and 3. Then we can merge the parts
20042 back together with an interleave.
20043
20044 Note that two other sequences were tried:
20045 (1) Use interleaves at the start instead of psrldq, which allows
20046 us to use a single shufps to merge things back at the end.
20047 (2) Use shufps here to combine the two vectors, then pshufd to
20048 put the elements in the correct order.
20049 In both cases the cost of the reformatting stall was too high
20050 and the overall sequence slower. */
20051
20052 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
20053 const0_rtx, const2_rtx,
20054 const0_rtx, const0_rtx));
20055 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
20056 const0_rtx, const2_rtx,
20057 const0_rtx, const0_rtx));
20058 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
20059
20060 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
20061 }
20062
20063 void
20064 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
20065 {
20066 machine_mode mode = GET_MODE (op0);
20067 rtx t1, t2, t3, t4, t5, t6;
20068
20069 if (TARGET_AVX512DQ && mode == V8DImode)
20070 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
20071 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
20072 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
20073 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
20074 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
20075 else if (TARGET_XOP && mode == V2DImode)
20076 {
20077 /* op1: A,B,C,D, op2: E,F,G,H */
20078 op1 = gen_lowpart (V4SImode, op1);
20079 op2 = gen_lowpart (V4SImode, op2);
20080
20081 t1 = gen_reg_rtx (V4SImode);
20082 t2 = gen_reg_rtx (V4SImode);
20083 t3 = gen_reg_rtx (V2DImode);
20084 t4 = gen_reg_rtx (V2DImode);
20085
20086 /* t1: B,A,D,C */
20087 emit_insn (gen_sse2_pshufd_1 (t1, op1,
20088 GEN_INT (1),
20089 GEN_INT (0),
20090 GEN_INT (3),
20091 GEN_INT (2)));
20092
20093 /* t2: (B*E),(A*F),(D*G),(C*H) */
20094 emit_insn (gen_mulv4si3 (t2, t1, op2));
20095
20096 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20097 emit_insn (gen_xop_phadddq (t3, t2));
20098
20099 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20100 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
20101
20102 /* Multiply the low parts and add in the shifted cross products. */
20103 t5 = gen_reg_rtx (V2DImode);
20104 emit_insn (gen_vec_widen_umult_even_v4si (t5,
20105 gen_lowpart (V4SImode, op1),
20106 gen_lowpart (V4SImode, op2)));
20107 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
20108 }
20109 else
20110 {
20111 machine_mode nmode;
20112 rtx (*umul) (rtx, rtx, rtx);
20113
20114 if (mode == V2DImode)
20115 {
20116 umul = gen_vec_widen_umult_even_v4si;
20117 nmode = V4SImode;
20118 }
20119 else if (mode == V4DImode)
20120 {
20121 umul = gen_vec_widen_umult_even_v8si;
20122 nmode = V8SImode;
20123 }
20124 else if (mode == V8DImode)
20125 {
20126 umul = gen_vec_widen_umult_even_v16si;
20127 nmode = V16SImode;
20128 }
20129 else
20130 gcc_unreachable ();
20131
20132
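/* This path uses the schoolbook identity, modulo 2^64:
     (A_hi * 2^32 + A_lo) * (B_hi * 2^32 + B_lo)
       == A_lo * B_lo + ((A_hi * B_lo + B_hi * A_lo) << 32).  */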
20133 /* Multiply low parts. */
20134 t1 = gen_reg_rtx (mode);
20135 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
20136
20137 /* Shift input vectors right 32 bits so we can multiply high parts. */
20138 t6 = GEN_INT (32);
20139 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
20140 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
20141
20142 /* Multiply high parts by low parts. */
20143 t4 = gen_reg_rtx (mode);
20144 t5 = gen_reg_rtx (mode);
20145 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
20146 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
20147
20148 /* Combine and shift the highparts back. */
20149 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
20150 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
20151
20152 /* Combine high and low parts. */
20153 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
20154 }
20155
20156 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20157 gen_rtx_MULT (mode, op1, op2));
20158 }
20159
20160 /* Return true if the control transfer instruction INSN
20161 should be encoded with the notrack prefix. */
20162
20163 bool
20164 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
20165 {
20166 if (!insn || !((flag_cf_protection & CF_BRANCH)))
20167 return false;
20168
20169 if (CALL_P (insn))
20170 {
20171 rtx call = get_call_rtx_from (insn);
20172 gcc_assert (call != NULL_RTX);
20173 rtx addr = XEXP (call, 0);
20174
20175 /* Do not emit 'notrack' if it's not an indirect call. */
20176 if (MEM_P (addr)
20177 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
20178 return false;
20179 else
20180 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
20181 }
20182
20183 if (JUMP_P (insn) && !flag_cet_switch)
20184 {
20185 rtx target = JUMP_LABEL (insn);
20186 if (target == NULL_RTX || ANY_RETURN_P (target))
20187 return false;
20188
20189 /* Check whether the jump targets a switch table. */
20190 rtx_insn *label = as_a<rtx_insn *> (target);
20191 rtx_insn *table = next_insn (label);
20192 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20193 return false;
20194 else
20195 return true;
20196 }
20197 return false;
20198 }
20199
20200 /* Calculate integer abs() using only SSE2 instructions. */
20201
20202 void
20203 ix86_expand_sse2_abs (rtx target, rtx input)
20204 {
20205 machine_mode mode = GET_MODE (target);
20206 rtx tmp0, tmp1, x;
20207
20208 switch (mode)
20209 {
20210 case E_V2DImode:
20211 case E_V4DImode:
20212 /* For 64-bit signed integer X, with SSE4.2 use
20213 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20214 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
20215 32 and, since a 64-bit arithmetic right shift is not available, build the
20216 sign mask with a logical right shift followed by a negation. */
20217 if (TARGET_SSE4_2)
20218 {
20219 tmp0 = gen_reg_rtx (mode);
20220 tmp1 = gen_reg_rtx (mode);
20221 emit_move_insn (tmp1, CONST0_RTX (mode));
20222 if (mode == E_V2DImode)
20223 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20224 else
20225 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20226 }
20227 else
20228 {
20229 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20230 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20231 - 1), NULL, 0, OPTAB_DIRECT);
20232 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20233 }
20234
20235 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20236 NULL, 0, OPTAB_DIRECT);
20237 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20238 target, 0, OPTAB_DIRECT);
20239 break;
20240
20241 case E_V4SImode:
20242 /* For 32-bit signed integer X, the best way to calculate the absolute
20243 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
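/* E.g. for X == -5: X >> 31 == -1, (-1 ^ -5) == 4, and 4 - (-1) == 5.  */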
20244 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20245 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20246 NULL, 0, OPTAB_DIRECT);
20247 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20248 NULL, 0, OPTAB_DIRECT);
20249 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20250 target, 0, OPTAB_DIRECT);
20251 break;
20252
20253 case E_V8HImode:
20254 /* For 16-bit signed integer X, the best way to calculate the absolute
20255 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20256 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20257
20258 x = expand_simple_binop (mode, SMAX, tmp0, input,
20259 target, 0, OPTAB_DIRECT);
20260 break;
20261
20262 case E_V16QImode:
20263 /* For 8-bit signed integer X, the best way to calculate the absolute
20264 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20265 as SSE2 provides the PMINUB insn. */
20266 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20267
20268 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20269 target, 0, OPTAB_DIRECT);
20270 break;
20271
20272 default:
20273 gcc_unreachable ();
20274 }
20275
20276 if (x != target)
20277 emit_move_insn (target, x);
20278 }
20279
20280 /* Expand an extract from a vector register through pextr insn.
20281 Return true if successful. */
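/* For example, with a V2DImode source, extracting a 16-bit field at bit
   position 48 selects element 3 (pos / size) of its V8HImode view, which
   is in effect a single pextrw.  */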
20282
20283 bool
20284 ix86_expand_pextr (rtx *operands)
20285 {
20286 rtx dst = operands[0];
20287 rtx src = operands[1];
20288
20289 unsigned int size = INTVAL (operands[2]);
20290 unsigned int pos = INTVAL (operands[3]);
20291
20292 if (SUBREG_P (dst))
20293 {
20294 /* Reject non-lowpart subregs. */
20295 if (SUBREG_BYTE (dst) > 0)
20296 return false;
20297 dst = SUBREG_REG (dst);
20298 }
20299
20300 if (SUBREG_P (src))
20301 {
20302 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20303 src = SUBREG_REG (src);
20304 }
20305
20306 switch (GET_MODE (src))
20307 {
20308 case E_V16QImode:
20309 case E_V8HImode:
20310 case E_V4SImode:
20311 case E_V2DImode:
20312 case E_V1TImode:
20313 {
20314 machine_mode srcmode, dstmode;
20315 rtx d, pat;
20316
20317 if (!int_mode_for_size (size, 0).exists (&dstmode))
20318 return false;
20319
20320 switch (dstmode)
20321 {
20322 case E_QImode:
20323 if (!TARGET_SSE4_1)
20324 return false;
20325 srcmode = V16QImode;
20326 break;
20327
20328 case E_HImode:
20329 if (!TARGET_SSE2)
20330 return false;
20331 srcmode = V8HImode;
20332 break;
20333
20334 case E_SImode:
20335 if (!TARGET_SSE4_1)
20336 return false;
20337 srcmode = V4SImode;
20338 break;
20339
20340 case E_DImode:
20341 gcc_assert (TARGET_64BIT);
20342 if (!TARGET_SSE4_1)
20343 return false;
20344 srcmode = V2DImode;
20345 break;
20346
20347 default:
20348 return false;
20349 }
20350
20351 /* Reject extractions from misaligned positions. */
20352 if (pos & (size-1))
20353 return false;
20354
20355 if (GET_MODE (dst) == dstmode)
20356 d = dst;
20357 else
20358 d = gen_reg_rtx (dstmode);
20359
20360 /* Construct insn pattern. */
20361 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20362 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20363
20364 /* Let the rtl optimizers know about the zero extension performed. */
20365 if (dstmode == QImode || dstmode == HImode)
20366 {
20367 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20368 d = gen_lowpart (SImode, d);
20369 }
20370
20371 emit_insn (gen_rtx_SET (d, pat));
20372
20373 if (d != dst)
20374 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20375 return true;
20376 }
20377
20378 default:
20379 return false;
20380 }
20381 }
20382
20383 /* Expand an insert into a vector register through pinsr insn.
20384 Return true if successful. */
20385
20386 bool
20387 ix86_expand_pinsr (rtx *operands)
20388 {
20389 rtx dst = operands[0];
20390 rtx src = operands[3];
20391
20392 unsigned int size = INTVAL (operands[1]);
20393 unsigned int pos = INTVAL (operands[2]);
20394
20395 if (SUBREG_P (dst))
20396 {
20397 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20398 dst = SUBREG_REG (dst);
20399 }
20400
20401 switch (GET_MODE (dst))
20402 {
20403 case E_V16QImode:
20404 case E_V8HImode:
20405 case E_V4SImode:
20406 case E_V2DImode:
20407 case E_V1TImode:
20408 {
20409 machine_mode srcmode, dstmode;
20410 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20411 rtx d;
20412
20413 if (!int_mode_for_size (size, 0).exists (&srcmode))
20414 return false;
20415
20416 switch (srcmode)
20417 {
20418 case E_QImode:
20419 if (!TARGET_SSE4_1)
20420 return false;
20421 dstmode = V16QImode;
20422 pinsr = gen_sse4_1_pinsrb;
20423 break;
20424
20425 case E_HImode:
20426 if (!TARGET_SSE2)
20427 return false;
20428 dstmode = V8HImode;
20429 pinsr = gen_sse2_pinsrw;
20430 break;
20431
20432 case E_SImode:
20433 if (!TARGET_SSE4_1)
20434 return false;
20435 dstmode = V4SImode;
20436 pinsr = gen_sse4_1_pinsrd;
20437 break;
20438
20439 case E_DImode:
20440 gcc_assert (TARGET_64BIT);
20441 if (!TARGET_SSE4_1)
20442 return false;
20443 dstmode = V2DImode;
20444 pinsr = gen_sse4_1_pinsrq;
20445 break;
20446
20447 default:
20448 return false;
20449 }
20450
20451 /* Reject insertions to misaligned positions. */
20452 if (pos & (size-1))
20453 return false;
20454
20455 if (SUBREG_P (src))
20456 {
20457 unsigned int srcpos = SUBREG_BYTE (src);
20458
20459 if (srcpos > 0)
20460 {
20461 rtx extr_ops[4];
20462
20463 extr_ops[0] = gen_reg_rtx (srcmode);
20464 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20465 extr_ops[2] = GEN_INT (size);
20466 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20467
20468 if (!ix86_expand_pextr (extr_ops))
20469 return false;
20470
20471 src = extr_ops[0];
20472 }
20473 else
20474 src = gen_lowpart (srcmode, SUBREG_REG (src));
20475 }
20476
20477 if (GET_MODE (dst) == dstmode)
20478 d = dst;
20479 else
20480 d = gen_reg_rtx (dstmode);
20481
20482 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20483 gen_lowpart (srcmode, src),
20484 GEN_INT (1 << (pos / size))));
20485 if (d != dst)
20486 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20487 return true;
20488 }
20489
20490 default:
20491 return false;
20492 }
20493 }
20494
20495 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
20496 of the upper half against the lower half down to SSE register size. */
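/* For example, a V8SImode reduction is expected to be split into two
   V4SImode halves that are combined element-wise, with the reduction then
   continuing in V4SImode.  */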
20497
20498 machine_mode
20499 ix86_split_reduction (machine_mode mode)
20500 {
20501 /* Reduce lowpart against highpart until we reach SSE reg width to
20502 avoid cross-lane operations. */
20503 switch (mode)
20504 {
20505 case E_V8DImode:
20506 case E_V4DImode:
20507 return V2DImode;
20508 case E_V16SImode:
20509 case E_V8SImode:
20510 return V4SImode;
20511 case E_V32HImode:
20512 case E_V16HImode:
20513 return V8HImode;
20514 case E_V64QImode:
20515 case E_V32QImode:
20516 return V16QImode;
20517 case E_V16SFmode:
20518 case E_V8SFmode:
20519 return V4SFmode;
20520 case E_V8DFmode:
20521 case E_V4DFmode:
20522 return V2DFmode;
20523 default:
20524 return mode;
20525 }
20526 }
20527
20528 /* Generate a call to a divmod libfunc such as __divmoddi4. */
20529
20530 void
20531 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20532 rtx op0, rtx op1,
20533 rtx *quot_p, rtx *rem_p)
20534 {
20535 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20536
20537 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20538 mode, op0, mode, op1, mode,
20539 XEXP (rem, 0), Pmode);
20540 *quot_p = quot;
20541 *rem_p = rem;
20542 }
20543
20544 #include "gt-i386-expand.h"