From 2cdb31489836779b7316f60cb49c5eb83de88adb Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Wed, 5 Jan 2005 11:14:39 -0800
Subject: [PATCH] re PR target/12902 (Invalid assembly generated when using SSE / xmmintrin.h)

	PR target/12902
	* config/i386/i386.md (sse_movhps, sse_movlps): Remove.
	(sse_shufps): Change operand 3 to const_int_operand.
	(sse2_storelpd): Fix typo in template.
	(sse_storehps, sse_loadhps, sse_storelps, sse_loadlps): New.
	* config/i386/i386.c (ix86_expand_vector_move_misalign): Use them.
	(ix86_expand_builtin): Likewise.

From-SVN: r92967
---
 gcc/ChangeLog                         |  12 ++-
 gcc/config/i386/i386.c                |  47 +++-------
 gcc/config/i386/i386.md               | 118 ++++++++++++++++++++------
 gcc/testsuite/gcc.target/i386/sse-1.c |  25 ++++++
 4 files changed, 143 insertions(+), 59 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 39da29dadc6..08b41f583f3 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,4 +1,14 @@
-2004-01-05  Julian Brown
+2005-01-05  Richard Henderson
+
+	PR target/12902
+	* config/i386/i386.md (sse_movhps, sse_movlps): Remove.
+	(sse_shufps): Change operand 3 to const_int_operand.
+	(sse2_storelpd): Fix typo in template.
+	(sse_storehps, sse_loadhps, sse_storelps, sse_loadlps): New.
+	* config/i386/i386.c (ix86_expand_vector_move_misalign): Use them.
+	(ix86_expand_builtin): Likewise.
+
+2005-01-05  Julian Brown
 
 	* config/arm/arm.c (arm_return_in_memory): Treat complex types
 	as aggregates for AAPCS ABIs.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 23129a08943..51d36f18626 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1,6 +1,6 @@
 /* Subroutines used for code generation on IA-32.
    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
-   2002, 2003, 2004 Free Software Foundation, Inc.
+   2002, 2003, 2004, 2005 Free Software Foundation, Inc.
 
 This file is part of GCC.
 
@@ -7645,11 +7645,10 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
           else
             emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
 
-          op0 = gen_lowpart (V4SFmode, op0);
-          m = adjust_address (op1, V4SFmode, 0);
-          emit_insn (gen_sse_movlps (op0, op0, m));
-          m = adjust_address (op1, V4SFmode, 8);
-          emit_insn (gen_sse_movhps (op0, op0, m));
+          m = adjust_address (op1, V2SFmode, 0);
+          emit_insn (gen_sse_loadlps (op0, op0, m));
+          m = adjust_address (op1, V2SFmode, 8);
+          emit_insn (gen_sse_loadhps (op0, op0, m));
         }
     }
   else if (MEM_P (op0))
@@ -7684,11 +7683,10 @@
         }
       else
         {
-          op1 = gen_lowpart (V4SFmode, op1);
-          m = adjust_address (op0, V4SFmode, 0);
-          emit_insn (gen_sse_movlps (m, m, op1));
-          m = adjust_address (op0, V4SFmode, 8);
-          emit_insn (gen_sse_movhps (m, m, op1));
+          m = adjust_address (op0, V2SFmode, 0);
+          emit_insn (gen_sse_storelps (m, op1));
+          m = adjust_address (op0, V2SFmode, 8);
+          emit_insn (gen_sse_storehps (m, op1));
           return;
         }
     }
@@ -13508,8 +13506,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
     case IX86_BUILTIN_LOADHPS:
     case IX86_BUILTIN_LOADLPS:
     case IX86_BUILTIN_LOADHPD:
     case IX86_BUILTIN_LOADLPD:
-      icode = (fcode == IX86_BUILTIN_LOADHPS ? CODE_FOR_sse_movhps
-               : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_movlps
+      icode = (fcode == IX86_BUILTIN_LOADHPS ? CODE_FOR_sse_loadhps
+               : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_loadlps
                : fcode == IX86_BUILTIN_LOADHPD ? CODE_FOR_sse2_loadhpd
                : CODE_FOR_sse2_loadlpd);
       arg0 = TREE_VALUE (arglist);
@@ -13535,28 +13533,11 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
 
     case IX86_BUILTIN_STOREHPS:
     case IX86_BUILTIN_STORELPS:
-      icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_movhps
-               : CODE_FOR_sse_movlps);
-      arg0 = TREE_VALUE (arglist);
-      arg1 = TREE_VALUE (TREE_CHAIN (arglist));
-      op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
-      op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
-      mode0 = insn_data[icode].operand[1].mode;
-      mode1 = insn_data[icode].operand[2].mode;
-
-      op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
-      if (! (*insn_data[icode].operand[2].predicate) (op1, mode1))
-        op1 = copy_to_mode_reg (mode1, op1);
-
-      pat = GEN_FCN (icode) (op0, op0, op1);
-      if (! pat)
-        return 0;
-      emit_insn (pat);
-      return const0_rtx;
-
     case IX86_BUILTIN_STOREHPD:
     case IX86_BUILTIN_STORELPD:
-      icode = (fcode == IX86_BUILTIN_STOREHPD ? CODE_FOR_sse2_storehpd
+      icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_storehps
+               : fcode == IX86_BUILTIN_STORELPS ? CODE_FOR_sse_storelps
+               : fcode == IX86_BUILTIN_STOREHPD ? CODE_FOR_sse2_storehpd
                : CODE_FOR_sse2_storelpd);
       arg0 = TREE_VALUE (arglist);
       arg1 = TREE_VALUE (TREE_CHAIN (arglist));
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eb2eee895fc..7848579153e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1,6 +1,6 @@
 ;; GCC machine description for IA-32 and x86-64.
 ;; Copyright (C) 1988, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-;; 2001, 2002, 2003, 2004
+;; 2001, 2002, 2003, 2004, 2005
 ;; Free Software Foundation, Inc.
 ;; Mostly by William Schelter.
 ;; x86_64 support added by Jan Hubicka
@@ -20335,29 +20335,98 @@
   [(set_attr "type" "ssecvt")
    (set_attr "mode" "V4SF")])
 
-(define_insn "sse_movhps"
-  [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,m")
-        (vec_merge:V4SF
-         (match_operand:V4SF 1 "nonimmediate_operand" "0,0")
-         (match_operand:V4SF 2 "nonimmediate_operand" "m,x")
-         (const_int 12)))]
-  "TARGET_SSE
-   && (GET_CODE (operands[1]) == MEM || GET_CODE (operands[2]) == MEM)"
-  "movhps\t{%2, %0|%0, %2}"
+;; Store the high V2SF of the source vector to the destination.
+(define_insn "sse_storehps"
+  [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x")
+        (vec_select:V2SF
+          (match_operand:V4SF 1 "nonimmediate_operand" "x,x,o")
+          (parallel [(const_int 2) (const_int 3)])))]
+  "TARGET_SSE"
+  "@
+   movhps\t{%1, %0|%0, %1}
+   movhlps\t{%1, %0|%0, %1}
+   #"
   [(set_attr "type" "ssecvt")
-   (set_attr "mode" "V4SF")])
+   (set_attr "mode" "V2SF")])
 
-(define_insn "sse_movlps"
-  [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,m")
-        (vec_merge:V4SF
-         (match_operand:V4SF 1 "nonimmediate_operand" "0,0")
-         (match_operand:V4SF 2 "nonimmediate_operand" "m,x")
-         (const_int 3)))]
-  "TARGET_SSE
-   && (GET_CODE (operands[1]) == MEM || GET_CODE (operands[2]) == MEM)"
-  "movlps\t{%2, %0|%0, %2}"
+(define_split
+  [(set (match_operand:V2SF 0 "register_operand" "")
+        (vec_select:V2SF
+          (match_operand:V4SF 1 "memory_operand" "")
+          (parallel [(const_int 2) (const_int 3)])))]
+  "TARGET_SSE && reload_completed"
+  [(const_int 0)]
+{
+  emit_move_insn (operands[0], adjust_address (operands[1], V2SFmode, 8));
+  DONE;
+})
+
+;; Load the high V2SF of the target vector from the source vector.
+(define_insn "sse_loadhps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") + (vec_concat:V4SF + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "0,0,0") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:V2SF 2 "nonimmediate_operand" "m,x,x")))] + "TARGET_SSE" + "@ + movhps\t{%2, %0|%0, %2} + movlhps\t{%2, %0|%0, %2} + #" [(set_attr "type" "ssecvt") - (set_attr "mode" "V4SF")]) + (set_attr "mode" "V2SF")]) + +(define_split + [(set (match_operand:V4SF 0 "memory_operand" "") + (vec_concat:V4SF + (vec_select:V2SF + (match_dup 0) + (parallel [(const_int 0) (const_int 1)])) + (match_operand:V2SF 2 "register_operand" "")))] + "TARGET_SSE && reload_completed" + [(const_int 0)] +{ + emit_move_insn (adjust_address (operands[0], V2SFmode, 8), operands[1]); + DONE; +}) + +;; Store the low V2SF of the source vector to the destination. +(define_expand "sse_storelps" + [(set (match_operand:V2SF 0 "nonimmediate_operand" "") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_SSE" +{ + operands[1] = gen_lowpart (V2SFmode, operands[1]); + emit_move_insn (operands[0], operands[1]); + DONE; +}) + +;; Load the low V2SF of the target vector from the source vector. +(define_insn "sse_loadlps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") + (vec_concat:V4SF + (match_operand:V2SF 2 "nonimmediate_operand" "m,0,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "0,x,0") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_SSE" +{ + static const char * const alt[] = { + "movlps\t{%2, %0|%0, %2}", + "shufps\t{%2, %1, %0|%0, %1, %2}", + "movlps\t{%2, %0|%0, %2}" + }; + + if (which_alternative == 1) + operands[2] = GEN_INT (0xe4); + + return alt[which_alternative]; +} + [(set_attr "type" "ssecvt") + (set_attr "mode" "V2SF")]) (define_expand "sse_loadss" [(match_operand:V4SF 0 "register_operand" "") @@ -20405,10 +20474,9 @@ [(set (match_operand:V4SF 0 "register_operand" "=x") (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0") (match_operand:V4SF 2 "nonimmediate_operand" "xm") - (match_operand:SI 3 "immediate_operand" "i")] + (match_operand:SI 3 "const_int_operand" "n")] UNSPEC_SHUFFLE))] "TARGET_SSE" - ;; @@@ check operand order for intel/nonintel syntax "shufps\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" "ssecvt") (set_attr "mode" "V4SF")]) @@ -23902,7 +23970,7 @@ [(set (match_operand:DF 0 "nonimmediate_operand" "") (vec_select:DF (match_operand:V2DF 1 "nonimmediate_operand" "") - (parallel [(const_int 1)])))] + (parallel [(const_int 0)])))] "TARGET_SSE2" { operands[1] = gen_lowpart (DFmode, operands[1]); @@ -23910,7 +23978,7 @@ DONE; }) -;; Load the load double of the target vector from the source scalar. +;; Load the low double of the target vector from the source scalar. 
(define_insn "sse2_loadlpd" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=Y,Y,m") (vec_concat:V2DF diff --git a/gcc/testsuite/gcc.target/i386/sse-1.c b/gcc/testsuite/gcc.target/i386/sse-1.c new file mode 100644 index 00000000000..afae22d3705 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-1.c @@ -0,0 +1,25 @@ +/* PR 12902 */ +/* { dg-do compile } */ +/* { dg-options "-O1 -msse" } */ + +#include + +typedef union +{ + int i[4]; + float f[4]; + __m128 v; +} vector4_t; + +void +swizzle (const void *a, vector4_t * b, vector4_t * c) +{ + b->v = _mm_loadl_pi (b->v, (__m64 *) a); + c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1); +} + +/* While one legal rendering of each statement would be movaps;movlps;movaps, + we can implmenent this with just movlps;movlps. Since we do now, anything + less would be a regression. */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler "movlps" } } */ -- 2.30.2