+static void
+ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+{
+ struct expand_vec_perm_d d;
+ unsigned i, nelt, base;
+ bool ok;
+
+ d.target = targ;
+ d.op0 = op0;
+ d.op1 = op1;
+ d.vmode = GET_MODE (targ);
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ base = high_p ? nelt / 2 : 0;
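+ /* E.g. for nelt == 4 this builds { 0, 4, 1, 5 } (low) or
+ { 2, 6, 3, 7 } (high), i.e. a standard interleave of OP0 with OP1. */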
+ for (i = 0; i < nelt / 2; ++i)
+ {
+ d.perm[i * 2] = i + base;
+ d.perm[i * 2 + 1] = i + base + nelt;
+ }
+
+ /* Note that for AVX this isn't one instruction. */
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+}
+
+
+/* Expand a vector operation CODE for a V*QImode vector in terms of the
+ same operation on V*HImode. */
+
+void
+ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ enum machine_mode qimode = GET_MODE (dest);
+ enum machine_mode himode;
+ rtx (*gen_il) (rtx, rtx, rtx);
+ rtx (*gen_ih) (rtx, rtx, rtx);
+ rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+ struct expand_vec_perm_d d;
+ bool ok, full_interleave;
+ bool uns_p = false;
+ int i;
+
+ switch (qimode)
+ {
+ case V16QImode:
+ himode = V8HImode;
+ gen_il = gen_vec_interleave_lowv16qi;
+ gen_ih = gen_vec_interleave_highv16qi;
+ break;
+ case V32QImode:
+ himode = V16HImode;
+ gen_il = gen_avx2_interleave_lowv32qi;
+ gen_ih = gen_avx2_interleave_highv32qi;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ op2_l = op2_h = op2;
+ switch (code)
+ {
+ case MULT:
+ /* Unpack data such that we've got a source byte in each low byte of
+ each word. We don't care what goes into the high byte of each word.
+ Rather than trying to get zero in there, the most convenient thing
+ is to let it be a copy of the low byte. */
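+ /* Interleaving OP2 with itself turns bytes { b0 b1 b2 ... } into
+ { b0 b0 b1 b1 b2 b2 ... }, i.e. words whose low byte is the original
+ data and whose high byte is a harmless copy of it. */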
+ op2_l = gen_reg_rtx (qimode);
+ op2_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op2_l, op2, op2));
+ emit_insn (gen_ih (op2_h, op2, op2));
+
+ op1_l = gen_reg_rtx (qimode);
+ op1_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op1_l, op1, op1));
+ emit_insn (gen_ih (op1_h, op1, op1));
+ full_interleave = qimode == V16QImode;
+ break;
+
+ case ASHIFT:
+ case LSHIFTRT:
+ uns_p = true;
+ /* FALLTHRU */
+ case ASHIFTRT:
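+ /* For left and logical right shifts we can zero-extend: the high
+ byte of each word then contributes only zero bits to the result
+ byte. Arithmetic right shifts sign-extend instead, so the bits
+ shifted into each byte are copies of its sign bit. */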
+ op1_l = gen_reg_rtx (himode);
+ op1_h = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+ ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+ full_interleave = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Perform the operation. */
+ res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ gcc_assert (res_l && res_h);
+
+ /* Merge the data back into the right place. */
+ d.target = dest;
+ d.op0 = gen_lowpart (qimode, res_l);
+ d.op1 = gen_lowpart (qimode, res_h);
+ d.vmode = qimode;
+ d.nelt = GET_MODE_NUNITS (qimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ if (full_interleave)
+ {
+ /* For SSE2, we used a full interleave, so the desired
+ results are in the even elements. */
+ for (i = 0; i < 32; ++i)
+ d.perm[i] = i * 2;
+ }
+ else
+ {
+ /* For AVX, the interleave used above was not cross-lane. So the
+ extraction is the even elements, but with the second and third
+ quarters swapped. Happily, that is even one insn shorter than a
+ plain even extraction. */
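+ /* For V32QImode this selects elements { 0 2 ... 14, 32 34 ... 46,
+ 16 18 ... 30, 48 50 ... 62 }. */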
+ for (i = 0; i < 32; ++i)
+ d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+ }
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_fmt_ee (code, qimode, op1, op2));
+}
+
+void
+ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
+ bool uns_p, bool odd_p)
+{
+ enum machine_mode mode = GET_MODE (op1);
+ enum machine_mode wmode = GET_MODE (dest);
+ rtx x;
+
+ /* We only play even/odd games with vectors of SImode. */
+ gcc_assert (mode == V4SImode || mode == V8SImode);
+
+ /* If we're looking for the odd results, shift those members down to
+ the even slots. For some CPUs this is faster than a PSHUFD. */
+ if (odd_p)
+ {
+ if (TARGET_XOP && mode == V4SImode)
+ {
+ x = force_reg (wmode, CONST0_RTX (wmode));
+ emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
+ return;
+ }
+
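+ /* A logical right shift of the wide-mode view by 32 bits drops each
+ odd SImode element into the even slot below it, ready for the
+ even-multiply patterns used further down. */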
+ x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
+ op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+ x, NULL, 1, OPTAB_DIRECT);
+ op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+ x, NULL, 1, OPTAB_DIRECT);
+ op1 = gen_lowpart (mode, op1);
+ op2 = gen_lowpart (mode, op2);
+ }
+
+ if (mode == V8SImode)
+ {
+ if (uns_p)
+ x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
+ else
+ x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
+ }
+ else if (uns_p)
+ x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
+ else if (TARGET_SSE4_1)
+ x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
+ else
+ {
+ rtx s1, s2, t0, t1, t2;
+
+ /* The easiest way to implement this without PMULDQ is to go through
+ the motions as if we are performing a full 64-bit multiply, except
+ that we need to do less shuffling of the elements. */
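+ /* With a, b the unsigned views of the inputs and s_a, s_b their
+ sign bits, the signed product modulo 2^64 is
+ a*b - 2^32 * (a*s_b + b*s_a).
+ Multiplying by the all-ones sign masks below and shifting the
+ summed cross products left by 32 supplies exactly that correction
+ term (modulo 2^64). */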
+
+ /* Compute the sign-extension, aka highparts, of the two operands. */
+ s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+ op1, pc_rtx, pc_rtx);
+ s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+ op2, pc_rtx, pc_rtx);
+
+ /* Multiply LO(A) * HI(B), and vice-versa. */
+ t1 = gen_reg_rtx (wmode);
+ t2 = gen_reg_rtx (wmode);
+ emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
+ emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
+
+ /* Multiply LO(A) * LO(B). */
+ t0 = gen_reg_rtx (wmode);
+ emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
+
+ /* Combine and shift the highparts into place. */
+ t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+ t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+ 1, OPTAB_DIRECT);
+
+ /* Combine high and low parts. */
+ force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+ return;
+ }
+ emit_insn (x);
+}
+
+void
+ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
+ bool uns_p, bool high_p)
+{
+ enum machine_mode wmode = GET_MODE (dest);
+ enum machine_mode mode = GET_MODE (op1);
+ rtx t1, t2, t3, t4, mask;
+
+ switch (mode)
+ {
+ case V4SImode:
+ t1 = gen_reg_rtx (mode);
+ t2 = gen_reg_rtx (mode);
+ if (TARGET_XOP && !uns_p)
+ {
+ /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
+ shuffle the elements once so that all elements are in the right
+ place for immediate use: { A C B D }. */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ }
+ else
+ {
+ /* Put the elements into place for the multiply. */
+ ix86_expand_vec_interleave (t1, op1, op1, high_p);
+ ix86_expand_vec_interleave (t2, op2, op2, high_p);
+ high_p = false;
+ }
+ ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
+ break;
+
+ case V8SImode:
+ /* Shuffle the elements between the lanes. After this we
+ have { A B E F | C D G H } for each operand. */
+ t1 = gen_reg_rtx (V4DImode);
+ t2 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
+ const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
+ const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+
+ /* Shuffle the elements within the lanes. After this we
+ have { A A B B | C C D D } or { E E F F | G G H H }. */
+ t3 = gen_reg_rtx (V8SImode);
+ t4 = gen_reg_rtx (V8SImode);
+ mask = GEN_INT (high_p
+ ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
+ : 0 + (0 << 2) + (1 << 4) + (1 << 6));
+ emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
+ emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
+
+ ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
+ break;
+
+ case V8HImode:
+ case V16HImode:
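+ /* A plain HImode multiply gives the low 16 bits of each product and
+ a highpart multiply gives the high 16 bits; interleaving the two
+ reassembles the full 32-bit products for the requested half. */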
+ t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
+ uns_p, OPTAB_DIRECT);
+ t2 = expand_binop (mode,
+ uns_p ? umul_highpart_optab : smul_highpart_optab,
+ op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
+ gcc_assert (t1 && t2);
+
+ ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+ break;
+
+ case V16QImode:
+ case V32QImode:
+ t1 = gen_reg_rtx (wmode);
+ t2 = gen_reg_rtx (wmode);
+ ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
+ ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
+
+ emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+ rtx res_1, res_2;
+
+ res_1 = gen_reg_rtx (V4SImode);
+ res_2 = gen_reg_rtx (V4SImode);
+ ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
+ op1, op2, true, false);
+ ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
+ op1, op2, true, true);
+
+ /* Move the results in element 2 down to element 1; we don't care
+ what goes in elements 2 and 3. Then we can merge the parts
+ back together with an interleave.
+
+ Note that two other sequences were tried:
+ (1) Use interleaves at the start instead of psrldq, which allows
+ us to use a single shufps to merge things back at the end.
+ (2) Use shufps here to combine the two vectors, then pshufd to
+ put the elements in the correct order.
+ In both cases the cost of the reformatting stall was too high
+ and the overall sequence slower. */
+
+ emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+
+ set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+
+void
+ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+{
+ enum machine_mode mode = GET_MODE (op0);
+ rtx t1, t2, t3, t4, t5, t6;
+
+ if (TARGET_XOP && mode == V2DImode)
+ {
+ /* op1: A,B,C,D, op2: E,F,G,H */
+ op1 = gen_lowpart (V4SImode, op1);
+ op2 = gen_lowpart (V4SImode, op2);
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V2DImode);
+ t4 = gen_reg_rtx (V2DImode);
+
+ /* t1: B,A,D,C */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1,
+ GEN_INT (1),
+ GEN_INT (0),
+ GEN_INT (3),
+ GEN_INT (2)));
+
+ /* t2: (B*E),(A*F),(D*G),(C*H) */
+ emit_insn (gen_mulv4si3 (t2, t1, op2));
+
+ /* t3: (B*E)+(A*F), (D*G)+(C*H) */
+ emit_insn (gen_xop_phadddq (t3, t2));
+
+ /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+ emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+
+ /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+ emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
+ }
+ else
+ {
+ enum machine_mode nmode;
+ rtx (*umul) (rtx, rtx, rtx);
+
+ if (mode == V2DImode)
+ {
+ umul = gen_vec_widen_umult_even_v4si;
+ nmode = V4SImode;
+ }
+ else if (mode == V4DImode)
+ {
+ umul = gen_vec_widen_umult_even_v8si;
+ nmode = V8SImode;
+ }
+ else
+ gcc_unreachable ();
+
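+ /* Each 64-bit lane is computed from its 32-bit halves as
+ (hi1*2^32 + lo1) * (hi2*2^32 + lo2)
+ == lo1*lo2 + 2^32*(hi1*lo2 + hi2*lo1) (mod 2^64). */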
+ /* Multiply low parts. */
+ t1 = gen_reg_rtx (mode);
+ emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+
+ /* Shift input vectors right 32 bits so we can multiply high parts. */
+ t6 = GEN_INT (32);
+ t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
+ t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+
+ /* Multiply high parts by low parts. */
+ t4 = gen_reg_rtx (mode);
+ t5 = gen_reg_rtx (mode);
+ emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
+ emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+
+ /* Combine and shift the highparts back. */
+ t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
+ t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+
+ /* Combine high and low parts. */
+ force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+ }
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_MULT (mode, op1, op2));
+}
+