From: Andreas Krebbel Date: Mon, 20 Apr 2020 17:36:33 +0000 (+0200) Subject: PR94613: Fix vec_sel builtin for IBM Z X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2930bb321794;p=gcc.git PR94613: Fix vec_sel builtin for IBM Z The vsel instruction is a bit-wise select instruction. Using an IF_THEN_ELSE to express it in RTL is wrong and leads to wrong code being generated in the combine pass. With the patch the pattern is written using bit operations. However, I've just noticed that the manual still demands a fixed point mode for AND/IOR and friends although several targets emit bit ops on floating point vectors (including i386, Power, and s390). So I assume this is a safe thing to do?! gcc/ChangeLog: 2020-04-20 Andreas Krebbel PR target/94613 * config/s390/s390-builtin-types.def: Add 3 new function modes. * config/s390/s390-builtins.def: Add mode dependent low-level builtin and map the overloaded builtins to these. * config/s390/vx-builtins.md ("vec_selV_HW"): Rename to ... ("vsel PR target/94613 * gcc.target/s390/zvector/pr94613.c: New test. * gcc.target/s390/zvector/vec_sel-1.c: New test. --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 433b976077c..ed3541d1264 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2020-04-20 Andreas Krebbel + + PR target/94613 + * config/s390/s390-builtin-types.def: Add 3 new function modes. + * config/s390/s390-builtins.def: Add mode dependent low-level + builtin and map the overloaded builtins to these. + * config/s390/vx-builtins.md ("vec_selV_HW"): Rename to ... 
+ ("vsel * tree-vect-loop.c (vect_better_loop_vinfo_p): If old_loop_vinfo diff --git a/gcc/config/s390/s390-builtin-types.def b/gcc/config/s390/s390-builtin-types.def index 63b1c1ffd70..76ae8fed1ae 100644 --- a/gcc/config/s390/s390-builtin-types.def +++ b/gcc/config/s390/s390-builtin-types.def @@ -301,6 +301,7 @@ DEF_FN_TYPE_3 (BT_FN_UV16QI_UV2DI_UV2DI_UV16QI, BT_UV16QI, BT_UV2DI, BT_UV2DI, B DEF_FN_TYPE_3 (BT_FN_UV16QI_UV8HI_UV8HI_INTPTR, BT_UV16QI, BT_UV8HI, BT_UV8HI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_ULONGLONG_INT, BT_UV2DI, BT_UV2DI, BT_ULONGLONG, BT_INT) DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UV2DI_INT, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_INT) +DEF_FN_TYPE_3 (BT_FN_UV2DI_UV2DI_UV2DI_UV2DI, BT_UV2DI, BT_UV2DI, BT_UV2DI, BT_UV2DI) DEF_FN_TYPE_3 (BT_FN_UV2DI_UV4SI_UV4SI_UV2DI, BT_UV2DI, BT_UV4SI, BT_UV4SI, BT_UV2DI) DEF_FN_TYPE_3 (BT_FN_UV4SI_UV2DI_UV2DI_INTPTR, BT_UV4SI, BT_UV2DI, BT_UV2DI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_UV4SI_UV4SI_UINT_INT, BT_UV4SI, BT_UV4SI, BT_UINT, BT_INT) @@ -322,6 +323,7 @@ DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_DBL_INT, BT_V2DF, BT_V2DF, BT_DBL, BT_INT) DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_UCHAR_UCHAR, BT_V2DF, BT_V2DF, BT_UCHAR, BT_UCHAR) DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_UINT_UINT, BT_V2DF, BT_V2DF, BT_UINT, BT_UINT) DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_V2DF_INT, BT_V2DF, BT_V2DF, BT_V2DF, BT_INT) +DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_V2DF_UV2DI, BT_V2DF, BT_V2DF, BT_V2DF, BT_UV2DI) DEF_FN_TYPE_3 (BT_FN_V2DF_V2DF_V2DF_V2DF, BT_V2DF, BT_V2DF, BT_V2DF, BT_V2DF) DEF_FN_TYPE_3 (BT_FN_V2DI_UV2DI_UV2DI_INTPTR, BT_V2DI, BT_UV2DI, BT_UV2DI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_V2DI_V2DF_INT_INTPTR, BT_V2DI, BT_V2DF, BT_INT, BT_INTPTR) @@ -332,6 +334,7 @@ DEF_FN_TYPE_3 (BT_FN_V4SF_V2DF_INT_INT, BT_V4SF, BT_V2DF, BT_INT, BT_INT) DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_FLT_INT, BT_V4SF, BT_V4SF, BT_FLT, BT_INT) DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_UCHAR_UCHAR, BT_V4SF, BT_V4SF, BT_UCHAR, BT_UCHAR) DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_V4SF_INT, BT_V4SF, BT_V4SF, BT_V4SF, BT_INT) +DEF_FN_TYPE_3 
(BT_FN_V4SF_V4SF_V4SF_UV4SI, BT_V4SF, BT_V4SF, BT_V4SF, BT_UV4SI) DEF_FN_TYPE_3 (BT_FN_V4SF_V4SF_V4SF_V4SF, BT_V4SF, BT_V4SF, BT_V4SF, BT_V4SF) DEF_FN_TYPE_3 (BT_FN_V4SI_UV4SI_UV4SI_INTPTR, BT_V4SI, BT_UV4SI, BT_UV4SI, BT_INTPTR) DEF_FN_TYPE_3 (BT_FN_V4SI_V2DI_V2DI_INTPTR, BT_V4SI, BT_V2DI, BT_V2DI, BT_INTPTR) diff --git a/gcc/config/s390/s390-builtins.def b/gcc/config/s390/s390-builtins.def index d05570cdeba..c69573df695 100644 --- a/gcc/config/s390/s390-builtins.def +++ b/gcc/config/s390/s390-builtins.def @@ -687,36 +687,41 @@ B_DEF (s390_vsceg, vec_scatter_elementv2di,0, /* First two operands are swapped in s390-c.c */ OB_DEF (s390_vec_sel, s390_vec_sel_b8_a, s390_vec_sel_dbl_b, B_VX, BT_FN_OV4SI_OV4SI_OV4SI_OV4SI) -OB_DEF_VAR (s390_vec_sel_b8_a, s390_vsel, 0, 0, BT_OV_BV16QI_BV16QI_BV16QI_UV16QI) -OB_DEF_VAR (s390_vec_sel_b8_b, s390_vsel, 0, 0, BT_OV_BV16QI_BV16QI_BV16QI_BV16QI) -OB_DEF_VAR (s390_vec_sel_s8_a, s390_vsel, 0, 0, BT_OV_V16QI_V16QI_V16QI_UV16QI) -OB_DEF_VAR (s390_vec_sel_s8_b, s390_vsel, 0, 0, BT_OV_V16QI_V16QI_V16QI_BV16QI) -OB_DEF_VAR (s390_vec_sel_u8_a, s390_vsel, 0, 0, BT_OV_UV16QI_UV16QI_UV16QI_UV16QI) -OB_DEF_VAR (s390_vec_sel_u8_b, s390_vsel, 0, 0, BT_OV_UV16QI_UV16QI_UV16QI_BV16QI) -OB_DEF_VAR (s390_vec_sel_b16_a, s390_vsel, 0, 0, BT_OV_BV8HI_BV8HI_BV8HI_UV8HI) -OB_DEF_VAR (s390_vec_sel_b16_b, s390_vsel, 0, 0, BT_OV_BV8HI_BV8HI_BV8HI_BV8HI) -OB_DEF_VAR (s390_vec_sel_s16_a, s390_vsel, 0, 0, BT_OV_V8HI_V8HI_V8HI_UV8HI) -OB_DEF_VAR (s390_vec_sel_s16_b, s390_vsel, 0, 0, BT_OV_V8HI_V8HI_V8HI_BV8HI) -OB_DEF_VAR (s390_vec_sel_u16_a, s390_vsel, 0, 0, BT_OV_UV8HI_UV8HI_UV8HI_UV8HI) -OB_DEF_VAR (s390_vec_sel_u16_b, s390_vsel, 0, 0, BT_OV_UV8HI_UV8HI_UV8HI_BV8HI) -OB_DEF_VAR (s390_vec_sel_b32_a, s390_vsel, 0, 0, BT_OV_BV4SI_BV4SI_BV4SI_UV4SI) -OB_DEF_VAR (s390_vec_sel_b32_b, s390_vsel, 0, 0, BT_OV_BV4SI_BV4SI_BV4SI_BV4SI) -OB_DEF_VAR (s390_vec_sel_s32_a, s390_vsel, 0, 0, BT_OV_V4SI_V4SI_V4SI_UV4SI) -OB_DEF_VAR (s390_vec_sel_s32_b, s390_vsel, 0, 0, 
BT_OV_V4SI_V4SI_V4SI_BV4SI) -OB_DEF_VAR (s390_vec_sel_u32_a, s390_vsel, 0, 0, BT_OV_UV4SI_UV4SI_UV4SI_UV4SI) -OB_DEF_VAR (s390_vec_sel_u32_b, s390_vsel, 0, 0, BT_OV_UV4SI_UV4SI_UV4SI_BV4SI) -OB_DEF_VAR (s390_vec_sel_b64_a, s390_vsel, 0, 0, BT_OV_BV2DI_BV2DI_BV2DI_UV2DI) -OB_DEF_VAR (s390_vec_sel_b64_b, s390_vsel, 0, 0, BT_OV_BV2DI_BV2DI_BV2DI_BV2DI) -OB_DEF_VAR (s390_vec_sel_s64_a, s390_vsel, 0, 0, BT_OV_V2DI_V2DI_V2DI_UV2DI) -OB_DEF_VAR (s390_vec_sel_s64_b, s390_vsel, 0, 0, BT_OV_V2DI_V2DI_V2DI_BV2DI) -OB_DEF_VAR (s390_vec_sel_u64_a, s390_vsel, 0, 0, BT_OV_UV2DI_UV2DI_UV2DI_UV2DI) -OB_DEF_VAR (s390_vec_sel_u64_b, s390_vsel, 0, 0, BT_OV_UV2DI_UV2DI_UV2DI_BV2DI) -OB_DEF_VAR (s390_vec_sel_flt_a, s390_vsel, B_VXE, 0, BT_OV_V4SF_V4SF_V4SF_UV4SI) -OB_DEF_VAR (s390_vec_sel_flt_b, s390_vsel, B_VXE, 0, BT_OV_V4SF_V4SF_V4SF_BV4SI) -OB_DEF_VAR (s390_vec_sel_dbl_a, s390_vsel, 0, 0, BT_OV_V2DF_V2DF_V2DF_UV2DI) -OB_DEF_VAR (s390_vec_sel_dbl_b, s390_vsel, 0, 0, BT_OV_V2DF_V2DF_V2DF_BV2DI) - -B_DEF (s390_vsel, vec_selv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI) +OB_DEF_VAR (s390_vec_sel_b8_a, s390_vselb, 0, 0, BT_OV_BV16QI_BV16QI_BV16QI_UV16QI) +OB_DEF_VAR (s390_vec_sel_b8_b, s390_vselb, 0, 0, BT_OV_BV16QI_BV16QI_BV16QI_BV16QI) +OB_DEF_VAR (s390_vec_sel_s8_a, s390_vselb, 0, 0, BT_OV_V16QI_V16QI_V16QI_UV16QI) +OB_DEF_VAR (s390_vec_sel_s8_b, s390_vselb, 0, 0, BT_OV_V16QI_V16QI_V16QI_BV16QI) +OB_DEF_VAR (s390_vec_sel_u8_a, s390_vselb, 0, 0, BT_OV_UV16QI_UV16QI_UV16QI_UV16QI) +OB_DEF_VAR (s390_vec_sel_u8_b, s390_vselb, 0, 0, BT_OV_UV16QI_UV16QI_UV16QI_BV16QI) +OB_DEF_VAR (s390_vec_sel_b16_a, s390_vselh, 0, 0, BT_OV_BV8HI_BV8HI_BV8HI_UV8HI) +OB_DEF_VAR (s390_vec_sel_b16_b, s390_vselh, 0, 0, BT_OV_BV8HI_BV8HI_BV8HI_BV8HI) +OB_DEF_VAR (s390_vec_sel_s16_a, s390_vselh, 0, 0, BT_OV_V8HI_V8HI_V8HI_UV8HI) +OB_DEF_VAR (s390_vec_sel_s16_b, s390_vselh, 0, 0, BT_OV_V8HI_V8HI_V8HI_BV8HI) +OB_DEF_VAR (s390_vec_sel_u16_a, s390_vselh, 0, 0, BT_OV_UV8HI_UV8HI_UV8HI_UV8HI) +OB_DEF_VAR 
(s390_vec_sel_u16_b, s390_vselh, 0, 0, BT_OV_UV8HI_UV8HI_UV8HI_BV8HI) +OB_DEF_VAR (s390_vec_sel_b32_a, s390_vself, 0, 0, BT_OV_BV4SI_BV4SI_BV4SI_UV4SI) +OB_DEF_VAR (s390_vec_sel_b32_b, s390_vself, 0, 0, BT_OV_BV4SI_BV4SI_BV4SI_BV4SI) +OB_DEF_VAR (s390_vec_sel_s32_a, s390_vself, 0, 0, BT_OV_V4SI_V4SI_V4SI_UV4SI) +OB_DEF_VAR (s390_vec_sel_s32_b, s390_vself, 0, 0, BT_OV_V4SI_V4SI_V4SI_BV4SI) +OB_DEF_VAR (s390_vec_sel_u32_a, s390_vself, 0, 0, BT_OV_UV4SI_UV4SI_UV4SI_UV4SI) +OB_DEF_VAR (s390_vec_sel_u32_b, s390_vself, 0, 0, BT_OV_UV4SI_UV4SI_UV4SI_BV4SI) +OB_DEF_VAR (s390_vec_sel_b64_a, s390_vselg, 0, 0, BT_OV_BV2DI_BV2DI_BV2DI_UV2DI) +OB_DEF_VAR (s390_vec_sel_b64_b, s390_vselg, 0, 0, BT_OV_BV2DI_BV2DI_BV2DI_BV2DI) +OB_DEF_VAR (s390_vec_sel_s64_a, s390_vselg, 0, 0, BT_OV_V2DI_V2DI_V2DI_UV2DI) +OB_DEF_VAR (s390_vec_sel_s64_b, s390_vselg, 0, 0, BT_OV_V2DI_V2DI_V2DI_BV2DI) +OB_DEF_VAR (s390_vec_sel_u64_a, s390_vselg, 0, 0, BT_OV_UV2DI_UV2DI_UV2DI_UV2DI) +OB_DEF_VAR (s390_vec_sel_u64_b, s390_vselg, 0, 0, BT_OV_UV2DI_UV2DI_UV2DI_BV2DI) +OB_DEF_VAR (s390_vec_sel_flt_a, s390_vself_flt, B_VXE, 0, BT_OV_V4SF_V4SF_V4SF_UV4SI) +OB_DEF_VAR (s390_vec_sel_flt_b, s390_vself_flt, B_VXE, 0, BT_OV_V4SF_V4SF_V4SF_BV4SI) +OB_DEF_VAR (s390_vec_sel_dbl_a, s390_vselg_dbl, 0, 0, BT_OV_V2DF_V2DF_V2DF_UV2DI) +OB_DEF_VAR (s390_vec_sel_dbl_b, s390_vselg_dbl, 0, 0, BT_OV_V2DF_V2DF_V2DF_BV2DI) + +B_DEF (s390_vselb, vselv16qi, 0, B_VX, 0, BT_FN_UV16QI_UV16QI_UV16QI_UV16QI) +B_DEF (s390_vselh, vselv8hi, 0, B_VX, 0, BT_FN_UV8HI_UV8HI_UV8HI_UV8HI) +B_DEF (s390_vself, vselv4si, 0, B_VX, 0, BT_FN_UV4SI_UV4SI_UV4SI_UV4SI) +B_DEF (s390_vselg, vselv2di, 0, B_VX, 0, BT_FN_UV2DI_UV2DI_UV2DI_UV2DI) +B_DEF (s390_vself_flt, vselv4sf, 0, B_VXE, 0, BT_FN_V4SF_V4SF_V4SF_UV4SI) +B_DEF (s390_vselg_dbl, vselv2df, 0, B_VX, 0, BT_FN_V2DF_V2DF_V2DF_UV2DI) OB_DEF (s390_vec_extend_s64, s390_vec_extend_s64_s8,s390_vec_extend_s64_s32,B_VX, BT_FN_OV4SI_OV4SI) OB_DEF_VAR (s390_vec_extend_s64_s8, s390_vsegb, 0, 0, 
BT_OV_V2DI_V16QI) diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 9a7f3dae1df..0eed31923c5 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -499,23 +499,22 @@ ; Vector select -; Operand 3 selects bits from either OP1 (0) or OP2 (1) +; for all b in bits op0[b] = op3[b] == 0 ? op2[b] : op1[b] +; implemented as: op0 = (op1 & op3) | (op2 & ~op3) -; Comparison operator should not matter as long as we always use the same ?! +; Used to expand the vec_sel builtin. Operands op1 and op2 already got +; swapped in s390-c.c when we get here. -; Operands 1 and 2 are swapped in order to match the altivec builtin. -; If operand 3 is a const_int bitmask this would be vec_merge -(define_expand "vec_sel<mode>" - [(set (match_operand:V_HW 0 "register_operand" "") - (if_then_else:V_HW - (eq (match_operand:<tointvec> 3 "register_operand" "") - (match_dup 4)) - (match_operand:V_HW 2 "register_operand" "") - (match_operand:V_HW 1 "register_operand" "")))] +(define_insn "vsel<V_HW:mode>" + [(set (match_operand:V_HW 0 "register_operand" "=v") + (ior:V_HW + (and:V_HW (match_operand:V_HW 1 "register_operand" "v") + (match_operand:V_HW 3 "register_operand" "v")) + (and:V_HW (not:V_HW (match_dup 3)) + (match_operand:V_HW 2 "register_operand" "v"))))] "TARGET_VX" -{ - operands[4] = CONST0_RTX (<tointvec>mode); -}) + "vsel\t%v0,%1,%2,%3" + [(set_attr "op_type" "VRR")]) ; Vector sign extend to doubleword diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ea3ba36d97f..117eed9e35b 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2020-04-20 Andreas Krebbel + + PR target/94613 + * gcc.target/s390/zvector/pr94613.c: New test. + * gcc.target/s390/zvector/vec_sel-1.c: New test. + 2020-04-20 Richard Sandiford * gcc.target/aarch64/sve/cost_model_8.c: New test. 
diff --git a/gcc/testsuite/gcc.target/s390/zvector/pr94613.c b/gcc/testsuite/gcc.target/s390/zvector/pr94613.c new file mode 100644 index 00000000000..0d71042f881 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/pr94613.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target s390_vx } */ +/* { dg-options "-O3 -mzarch -march=arch13 -mzvector -std=gnu99 --save-temps" } */ + +#include <vecintrin.h> + +/* The initial implementation of vec_sel used an IF_THEN_ELSE rtx. + This did NOT match what the vsel instruction does. vsel is a + bit-wise operation. Using IF_THEN_ELSE made the + operation to be + simplified away in combine. A plus operation affects other bits in + the same element. Hence per-element simplifications are wrong for + vsel. */ +vector unsigned char __attribute__((noinline)) +foo (vector unsigned char a, vector unsigned char b, vector unsigned char c) +{ + return vec_sel (a + b, c, a); +} + +/* FIXME: The OR operation still should be optimized away in that case. */ +vector unsigned char __attribute__((noinline)) +bar (vector unsigned char a, vector unsigned char b, vector unsigned char c) +{ + return vec_sel (a | b, c, a); +} + +int +main () +{ + vector unsigned char v = (vector unsigned char){ 1 }; + + if (foo (v, v, v)[0] != 3) + __builtin_abort (); + + if (bar (v, v, v)[0] != 1) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec_sel-1.c b/gcc/testsuite/gcc.target/s390/zvector/vec_sel-1.c new file mode 100644 index 00000000000..d310f70d3a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec_sel-1.c @@ -0,0 +1,211 @@ +/* { dg-do run } */ +/* { dg-require-effective-target s390_vxe } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps -Wno-attributes" } */ + +#include <string.h> +#include <vecintrin.h> + +typedef vector signed char v16qi; +typedef vector unsigned char uv16qi; +typedef vector bool char bv16qi; + +typedef vector signed short int v8hi; +typedef vector unsigned short int uv8hi; +typedef 
vector bool short int bv8hi; + +typedef vector signed int v4si; +typedef vector unsigned int uv4si; +typedef vector bool int bv4si; + +typedef vector signed long long v2di; +typedef vector unsigned long long uv2di; +typedef vector bool long long bv2di; + +typedef vector float v4sf; +typedef vector double v2df; + +#define NUM_CONSTS 8 + +const v16qi v16qi_vals[NUM_CONSTS] = + { (v16qi){ 1 }, + (v16qi){ 2 }, + (v16qi){ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 }, + (v16qi){ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 }, + (v16qi){ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 }, + (v16qi){ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, + (v16qi){ 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }, + (v16qi){ 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1 } + }; + +const v8hi v8hi_vals[NUM_CONSTS] = + { (v8hi){ 1 }, + (v8hi){ 2 }, + (v8hi){ 1,1,1,1,1,1,1,1 }, + (v8hi){ 2,2,2,2,2,2,2,2 }, + (v8hi){ -1,-1,-1,-1,-1,-1,-1,-1 }, + (v8hi){ 0,0,0,0,0,0,0,0 }, + (v8hi){ 1,2,3,4,5,6,7,8 }, + (v8hi){ 8,7,6,5,4,3,2,1 } + }; + +const v4si v4si_vals[NUM_CONSTS] = + { (v4si){ 1 }, + (v4si){ 2 }, + (v4si){ 1,1,1,1 }, + (v4si){ 2,2,2,2 }, + (v4si){ -1,-1,-1,-1 }, + (v4si){ 0,0,0,0 }, + (v4si){ 1,2,3,4 }, + (v4si){ 4,3,2,1 } + }; + +const v2di v2di_vals[NUM_CONSTS] = + { (v2di){ 1 }, + (v2di){ 2 }, + (v2di){ 1,1 }, + (v2di){ 2,2 }, + (v2di){ -1,-1 }, + (v2di){ 0,0 }, + (v2di){ 1,2 }, + (v2di){ 2,1 } + }; + +const v4sf v4sf_vals[NUM_CONSTS] = + { (v4sf){ 1.0f }, + (v4sf){ 2.0f }, + (v4sf){ 1.0f,1.0f,1.0f,1.0f }, + (v4sf){ 2.0f,2.0f,2.0f,2.0f }, + (v4sf){ -1.0f,-1.0f,-1.0f,-1.0f }, + (v4sf){ 0.0f,0.0f,0.0f,0.0f }, + (v4sf){ 1.1f,2.1f,3.1f,4.1f }, + (v4sf){ 4.1f,3.1f,2.1f,1.1f } + }; + +const v2df v2df_vals[NUM_CONSTS] = + { (v2df){ 1.0 }, + (v2df){ 2.0 }, + (v2df){ 1.0,1.0 }, + (v2df){ 2.0,2.0 }, + (v2df){ -1.0,-1.0 }, + (v2df){ 0.0,0.0 }, + (v2df){ 1.1,2.1 }, + (v2df){ 2.1,1.1 } + }; + +/* Each bit of the result vector has the value of the corresponding + bit of A if the corresponding bit of C is 0, or the value of the + corresponding bit of B 
otherwise. */ +void __attribute__((noinline, noclone, target ("arch=zEC12"))) +emul (unsigned char *result, unsigned char *a, + unsigned char *b, unsigned char *c) +{ + for (int i = 0; i < 16; i++) + result[i] = (a[i] & ~c[i]) | (b[i] & c[i]); +} + +#define GENFUNC(NAME, T1, T2) \ + T1 __attribute__((noinline, noclone)) \ + NAME##_reg (T1 a, T1 b, T2 c) { return vec_sel (a, b, c); } \ + void __attribute__((noinline, noclone)) \ + NAME##_mem (T1 *a, T1 *b, T2 *c, T1 *out) { *out = vec_sel (*a, *b, *c); } \ + T1 __attribute__((always_inline)) \ + NAME##_const (T1 a, T1 b, T2 c) { return vec_sel (a, b, c); } + +GENFUNC (vec_sel_b8_a, bv16qi, uv16qi) +GENFUNC (vec_sel_b8_b, bv16qi, bv16qi) +GENFUNC (vec_sel_s8_a, v16qi, uv16qi) +GENFUNC (vec_sel_s8_b, v16qi, bv16qi) +GENFUNC (vec_sel_u8_a, uv16qi, uv16qi) +GENFUNC (vec_sel_u8_b, uv16qi, bv16qi) + +GENFUNC (vec_sel_b16_a, bv8hi, uv8hi) +GENFUNC (vec_sel_b16_b, bv8hi, bv8hi) +GENFUNC (vec_sel_s16_a, v8hi, uv8hi) +GENFUNC (vec_sel_s16_b, v8hi, bv8hi) +GENFUNC (vec_sel_u16_a, uv8hi, uv8hi) +GENFUNC (vec_sel_u16_b, uv8hi, bv8hi) + +GENFUNC (vec_sel_b32_a, bv4si, uv4si) +GENFUNC (vec_sel_b32_b, bv4si, bv4si) +GENFUNC (vec_sel_s32_a, v4si, uv4si) +GENFUNC (vec_sel_s32_b, v4si, bv4si) +GENFUNC (vec_sel_u32_a, uv4si, uv4si) +GENFUNC (vec_sel_u32_b, uv4si, bv4si) + +GENFUNC (vec_sel_b64_a, bv2di, uv2di) +GENFUNC (vec_sel_b64_b, bv2di, bv2di) +GENFUNC (vec_sel_s64_a, v2di, uv2di) +GENFUNC (vec_sel_s64_b, v2di, bv2di) +GENFUNC (vec_sel_u64_a, uv2di, uv2di) +GENFUNC (vec_sel_u64_b, uv2di, bv2di) + +GENFUNC (vec_sel_flt_a, v4sf, uv4si) +GENFUNC (vec_sel_flt_b, v4sf, bv4si) + +GENFUNC (vec_sel_dbl_a, v2df, uv2di) +GENFUNC (vec_sel_dbl_b, v2df, bv2di) + +#define TESTFUNC(NAME, T1, T2, VAL_TYPE) \ + for (int i = 0; i < NUM_CONSTS; i++) \ + for (int j = 0; j < NUM_CONSTS; j++) \ + for (int k = 0; k < NUM_CONSTS; k++) \ + { \ + unsigned char result[16]; \ + T1 in1 = (T1)VAL_TYPE##_vals[i]; \ + T1 in2 = (T1)VAL_TYPE##_vals[j]; \ + T2 in3 
= (T2)VAL_TYPE##_vals[k]; \ + emul (result, (char*)&in1, (char*)&in2, (char*)&in3); \ + \ + T1 reg = NAME##_reg (in1, in2, in3); \ + if (memcmp ((char*)&reg, result, 16) != 0) \ + __builtin_abort (); \ + \ + T1 mem; \ + NAME##_mem (&in1, &in2, &in3, &mem); \ + if (memcmp ((char*)&mem, result, 16) != 0) \ + __builtin_abort (); \ + \ + T1 cons = NAME##_const (in1, in2, in3); \ + if (memcmp ((char*)&cons, result, 16) != 0) \ + __builtin_abort (); \ + } + +int +main () +{ + TESTFUNC (vec_sel_b8_a, bv16qi, uv16qi, v16qi); + TESTFUNC (vec_sel_b8_b, bv16qi, bv16qi, v16qi); + TESTFUNC (vec_sel_s8_a, v16qi, uv16qi, v16qi); + TESTFUNC (vec_sel_s8_b, v16qi, bv16qi, v16qi); + TESTFUNC (vec_sel_u8_a, uv16qi, uv16qi, v16qi); + TESTFUNC (vec_sel_u8_b, uv16qi, bv16qi, v16qi); + + TESTFUNC (vec_sel_b16_a, bv8hi, uv8hi, v8hi); + TESTFUNC (vec_sel_b16_b, bv8hi, bv8hi, v8hi); + TESTFUNC (vec_sel_s16_a, v8hi, uv8hi, v8hi); + TESTFUNC (vec_sel_s16_b, v8hi, bv8hi, v8hi); + TESTFUNC (vec_sel_u16_a, uv8hi, uv8hi, v8hi); + TESTFUNC (vec_sel_u16_b, uv8hi, bv8hi, v8hi); + + TESTFUNC (vec_sel_b32_a, bv4si, uv4si, v4si); + TESTFUNC (vec_sel_b32_b, bv4si, bv4si, v4si); + TESTFUNC (vec_sel_s32_a, v4si, uv4si, v4si); + TESTFUNC (vec_sel_s32_b, v4si, bv4si, v4si); + TESTFUNC (vec_sel_u32_a, uv4si, uv4si, v4si); + TESTFUNC (vec_sel_u32_b, uv4si, bv4si, v4si); + + TESTFUNC (vec_sel_b64_a, bv2di, uv2di, v2di); + TESTFUNC (vec_sel_b64_b, bv2di, bv2di, v2di); + TESTFUNC (vec_sel_s64_a, v2di, uv2di, v2di); + TESTFUNC (vec_sel_s64_b, v2di, bv2di, v2di); + TESTFUNC (vec_sel_u64_a, uv2di, uv2di, v2di); + TESTFUNC (vec_sel_u64_b, uv2di, bv2di, v2di); + + TESTFUNC (vec_sel_flt_a, v4sf, uv4si, v4sf); + TESTFUNC (vec_sel_flt_b, v4sf, bv4si, v4sf); + + TESTFUNC (vec_sel_dbl_a, v2df, uv2di, v2df); + TESTFUNC (vec_sel_dbl_b, v2df, bv2di, v2df); +} + +/* { dg-final { scan-assembler {\n\tvsel\t} } } */