+2018-02-08 Sergey Shalnov <sergey.shalnov@intel.com>
+
+ PR target/83008
+ * config/i386/x86-tune-costs.h (skylake_cost): Fix cost of
+ storing integer register in SImode. Fix cost of 256-bit and
+ 512-bit aligned SSE register stores.
+
+2018-02-08 Sergey Shalnov <sergey.shalnov@intel.com>
+
+ * config/i386/i386.c (ix86_multiplication_cost): Fix
+ multiplication cost for TARGET_AVX512DQ.
+
2018-02-08 Marek Polacek <polacek@redhat.com>
PR tree-optimization/84238
? cost->mulsd : cost->mulss, true);
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
+ /* vpmullq is used in this case. No emulation is needed. */
+ if (TARGET_AVX512DQ)
+ return ix86_vec_cost (mode, cost->mulss, true);
+
/* V*QImode is emulated with 7-13 insns. */
if (mode == V16QImode || mode == V32QImode)
{
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
- {6, 6, 6}, /* cost of storing integer registers */
+ {6, 6, 3}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{6, 6, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 6, 10, 20}, /* cost of loading SSE registers
in 32,64,128,256 and 512-bit */
{6, 6, 6, 10, 20}, /* cost of unaligned loads. */
- {8, 8, 8, 8, 16}, /* cost of storing SSE registers
+ {8, 8, 8, 12, 24}, /* cost of storing SSE registers
in 32,64,128,256 and 512-bit */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, /* SSE->integer and integer->SSE moves */
+2018-02-08 Sergey Shalnov <sergey.shalnov@intel.com>
+
+ PR target/83008
+ * gcc.target/i386/pr83008.c: New test.
+
2018-02-08 Peter Bergner <bergner@vnet.ibm.com>
PR target/81143
--- /dev/null
+/* PR target/83008
+   Compile-only test.  The function below is built with the
+   skylake-avx512 options listed in the options directive, and the
+   final directive requires that none of vmovdqa32, vmovdqa64,
+   vmovdqu32 or vmovdqu64 appears in the generated assembly.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -funroll-loops -march=skylake-avx512 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-not "vmovdq(a|u)(32|64)" } } */
+
+int
+pr83008 (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+ unsigned int tmp[4][4]; /* Filled row by row in the first loop, read column by column in the second.  */
+ unsigned int a0, a1, a2, a3;
+ int sum = 0;
+ for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) /* Each iteration advances pix1/pix2 by i_pix1/i_pix2.  */
+ {
+ a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); /* Difference k plus difference k+4 shifted left by 16.  */
+ a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+ a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+ a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+ int t0 = a0 + a1; /* Pairwise sums and differences of a0..a3.  */
+ int t1 = a0 - a1;
+ int t2 = a2 + a3;
+ int t3 = a2 - a3;
+ tmp[i][0] = t0 + t2; /* Second add/subtract stage; results stored in row i.  */
+ tmp[i][2] = t0 - t2;
+ tmp[i][1] = t1 + t3;
+ tmp[i][3] = t1 - t3;
+ }
+ for (int i = 0; i < 4; i++)
+ {
+ int t0 = tmp[0][i] + tmp[1][i]; /* Same add/subtract pattern applied down column i of tmp.  */
+ int t1 = tmp[0][i] - tmp[1][i];
+ int t2 = tmp[2][i] + tmp[3][i];
+ int t3 = tmp[2][i] - tmp[3][i];
+ a0 = t0 + t2;
+ a2 = t0 - t2;
+ a1 = t1 + t3;
+ a3 = t1 - t3;
+ sum += (a0) + (a1) + (a2) + (a3); /* Accumulate the four outputs for this column.  */
+ }
+ return (sum + ((unsigned int) sum >> 16)) >> 1; /* Add sum to itself shifted right by 16 (as unsigned), then shift right by 1.  */
+}