From a5879399fc0dcdd6b1ebe9a3b0d03db5ca30150a Mon Sep 17 00:00:00 2001
From: Andrew Stubbs <ams@codesourcery.com>
Date: Fri, 7 Feb 2020 11:14:43 +0000
Subject: [PATCH] amdgcn: Extend reductions to all types

Add support for V64DFmode addition, and V64DImode min, max.  There's no
direct hardware support for these, so we use regular vector instructions
and separate lane shift instructions.

Also add support for V64QI and V64HI reductions. Some of these require
additional extends and truncates, because AMD GCN has 32-bit vector lanes.

2020-03-02  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/gcn/gcn-valu.md (dpp_move<mode>): New.
	(reduc_insn): Use 'U' and 'B' operand codes.
	(reduc_<reduc_op>_scal_<mode>): Allow all types.
	(reduc_<reduc_op>_scal_v64di): Delete.
	(*<reduc_op>_dpp_shr_<mode>): Allow all 1reg types.
	(*plus_carry_dpp_shr_v64si): Change to ...
	(*plus_carry_dpp_shr_<mode>): ... this and allow all 1reg int types.
	(mov_from_lane63_v64di): Change to ...
	(mov_from_lane63_<mode>): ... this, and allow all 64-bit modes.
	* config/gcn/gcn.c (gcn_expand_dpp_shr_insn): Increase buffer size.
	Support UNSPEC_MOV_DPP_SHR output formats.
	(gcn_expand_reduc_scalar): Add "use_moves" reductions.
	Add "use_extends" reductions.
	(print_operand): Add 'I' and 'U' codes.
	* config/gcn/gcn.md (unspec): Add UNSPEC_MOV_DPP_SHR.
---
 gcc/ChangeLog              |  18 ++++++
 gcc/config/gcn/gcn-valu.md |  76 +++++++++++++------------
 gcc/config/gcn/gcn.c       | 111 ++++++++++++++++++++++++++++++-------
 gcc/config/gcn/gcn.md      |   1 +
 4 files changed, 147 insertions(+), 59 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 05bfa476e14..0d65434de4d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,21 @@
+2020-03-02  Andrew Stubbs  <ams@codesourcery.com>
+
+	* config/gcn/gcn-valu.md (dpp_move<mode>): New.
+	(reduc_insn): Use 'U' and 'B' operand codes.
+	(reduc_<reduc_op>_scal_<mode>): Allow all types.
+	(reduc_<reduc_op>_scal_v64di): Delete.
+	(*<reduc_op>_dpp_shr_<mode>): Allow all 1reg types.
+	(*plus_carry_dpp_shr_v64si): Change to ...
+	(*plus_carry_dpp_shr_<mode>): ... this and allow all 1reg int types.
+	(mov_from_lane63_v64di): Change to ...
+	(mov_from_lane63_<mode>): ... this, and allow all 64-bit modes.
+	* config/gcn/gcn.c (gcn_expand_dpp_shr_insn): Increase buffer size.
+	Support UNSPEC_MOV_DPP_SHR output formats.
+	(gcn_expand_reduc_scalar): Add "use_moves" reductions.
+	Add "use_extends" reductions.
+	(print_operand): Add 'I' and 'U' codes.
+	* config/gcn/gcn.md (unspec): Add UNSPEC_MOV_DPP_SHR.
+
 2020-03-02  Martin Liska  <mliska@suse.cz>
 
 	* lto-wrapper.c: Fix typo in comment about
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 40e864a8de7..a8034f77798 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -985,6 +985,20 @@
   [(set_attr "type" "vmult")
    (set_attr "length" "24")])
 
+(define_insn "@dpp_move<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand"    "=v")
+	(unspec:VEC_REG_MODE
+	  [(match_operand:VEC_REG_MODE 1 "register_operand" " v")
+	   (match_operand:SI 2 "const_int_operand"	    " n")]
+	  UNSPEC_MOV_DPP_SHR))]
+  ""
+  {
+    return gcn_expand_dpp_shr_insn (<MODE>mode, "v_mov_b32",
+				    UNSPEC_MOV_DPP_SHR, INTVAL (operands[2]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "16")])
+
 ;; }}}
 ;; {{{ ALU special case: add/sub
 
@@ -2969,15 +2983,15 @@
 			     (UNSPEC_SMAX_DPP_SHR "v_max%i0")
 			     (UNSPEC_UMIN_DPP_SHR "v_min%u0")
 			     (UNSPEC_UMAX_DPP_SHR "v_max%u0")
-			     (UNSPEC_PLUS_DPP_SHR "v_add%u0")
-			     (UNSPEC_AND_DPP_SHR  "v_and%b0")
-			     (UNSPEC_IOR_DPP_SHR  "v_or%b0")
-			     (UNSPEC_XOR_DPP_SHR  "v_xor%b0")])
+			     (UNSPEC_PLUS_DPP_SHR "v_add%U0")
+			     (UNSPEC_AND_DPP_SHR  "v_and%B0")
+			     (UNSPEC_IOR_DPP_SHR  "v_or%B0")
+			     (UNSPEC_XOR_DPP_SHR  "v_xor%B0")])
 
 (define_expand "reduc_<reduc_op>_scal_<mode>"
   [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
 	(unspec:<SCALAR_MODE>
-	  [(match_operand:VEC_1REG_MODE 1 "register_operand")]
+	  [(match_operand:VEC_ALLREG_MODE 1 "register_operand")]
 	  REDUC_UNSPEC))]
   ""
   {
@@ -2990,29 +3004,15 @@
     DONE;
   })
 
-(define_expand "reduc_<reduc_op>_scal_v64di"
-  [(set (match_operand:DI 0 "register_operand")
-	(unspec:DI
-	  [(match_operand:V64DI 1 "register_operand")]
-	  REDUC_2REG_UNSPEC))]
-  ""
-  {
-    rtx tmp = gcn_expand_reduc_scalar (V64DImode, operands[1],
-				       <reduc_unspec>);
-
-    /* The result of the reduction is in lane 63 of tmp.  */
-    emit_insn (gen_mov_from_lane63_v64di (operands[0], tmp));
-
-    DONE;
-  })
 
 (define_insn "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"   "=v")
-	(unspec:VEC_1REG_MODE
-	  [(match_operand:VEC_1REG_MODE 1 "register_operand" "v")
-	   (match_operand:VEC_1REG_MODE 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"	     "n")]
+  [(set (match_operand:VEC_ALL1REG_MODE 0 "register_operand"   "=v")
+	(unspec:VEC_ALL1REG_MODE
+	  [(match_operand:VEC_ALL1REG_MODE 1 "register_operand" "v")
+	   (match_operand:VEC_ALL1REG_MODE 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"		"n")]
 	  REDUC_UNSPEC))]
+  ; GCN3 requires a carry out; GCN5 does not.
   "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
      && <reduc_unspec> == UNSPEC_PLUS_DPP_SHR)"
   {
@@ -3051,18 +3051,17 @@
 
 ; Special cases for addition.
 
-(define_insn "*plus_carry_dpp_shr_v64si"
-  [(set (match_operand:V64SI 0 "register_operand"   "=v")
-	(unspec:V64SI
-	  [(match_operand:V64SI 1 "register_operand" "v")
-	   (match_operand:V64SI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"   "n")]
+(define_insn "*plus_carry_dpp_shr_<mode>"
+  [(set (match_operand:VEC_ALL1REG_INT_MODE 0 "register_operand"   "=v")
+	(unspec:VEC_ALL1REG_INT_MODE
+	  [(match_operand:VEC_ALL1REG_INT_MODE 1 "register_operand" "v")
+	   (match_operand:VEC_ALL1REG_INT_MODE 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"		    "n")]
 	  UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
   {
-    const char *insn = TARGET_GCN3 ? "v_add%u0" : "v_add_co%u0";
-    return gcn_expand_dpp_shr_insn (V64SImode, insn,
+    return gcn_expand_dpp_shr_insn (V64SImode, "v_add%^_u32",
 				    UNSPEC_PLUS_CARRY_DPP_SHR,
 				    INTVAL (operands[3]));
   }
@@ -3080,8 +3079,7 @@
    (clobber (reg:DI VCC_REG))]
   ""
   {
-    const char *insn = TARGET_GCN3 ? "v_addc%u0" : "v_addc_co%u0";
-    return gcn_expand_dpp_shr_insn (V64SImode, insn,
+    return gcn_expand_dpp_shr_insn (V64SImode, "v_addc%^_u32",
 				    UNSPEC_PLUS_CARRY_IN_DPP_SHR,
 				    INTVAL (operands[3]));
   }
@@ -3134,10 +3132,10 @@
    (set_attr "exec" "none,*")
    (set_attr "length" "8")])
 
-(define_insn "mov_from_lane63_v64di"
-  [(set (match_operand:DI 0 "register_operand"	     "=Sg,v")
-	(unspec:DI
-	  [(match_operand:V64DI 1 "register_operand"   "v,v")]
+(define_insn "mov_from_lane63_<mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand"  "=Sg,v")
+	(unspec:<SCALAR_MODE>
+	  [(match_operand:VEC_2REG_MODE 1 "register_operand" "v,v")]
 	  UNSPEC_MOV_FROM_LANE63))]
   ""
   "@
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index a8fd7d6d52f..12438cf6eb4 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -4121,7 +4121,7 @@ char *
 gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
 			 int unspec, int shift)
 {
-  static char buf[64];
+  static char buf[128];
   const char *dpp;
   const char *vcc_in = "";
   const char *vcc_out = "";
@@ -4162,7 +4162,13 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
       gcc_unreachable ();
     }
 
-  sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
+  if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
+    sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+	     insn, dpp, insn, dpp);
+  else if (unspec == UNSPEC_MOV_DPP_SHR)
+    sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+  else
+    sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
 
   return buf;
 }
@@ -4176,7 +4182,28 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
 rtx
 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
 {
-  rtx tmp = gen_reg_rtx (mode);
+  machine_mode orig_mode = mode;
+  bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
+		      || unspec == UNSPEC_SMAX_DPP_SHR
+		      || unspec == UNSPEC_UMIN_DPP_SHR
+		      || unspec == UNSPEC_UMAX_DPP_SHR)
+		     && mode == V64DImode)
+		    || (unspec == UNSPEC_PLUS_DPP_SHR
+			&& mode == V64DFmode));
+  rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
+		   : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
+		   : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
+		   : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
+		   : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
+		   : UNKNOWN);
+  bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
+		       || unspec == UNSPEC_SMAX_DPP_SHR
+		       || unspec == UNSPEC_UMIN_DPP_SHR
+		       || unspec == UNSPEC_UMAX_DPP_SHR)
+		      && (mode == V64QImode
+			  || mode == V64HImode));
+  bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
+		    || unspec == UNSPEC_UMAX_DPP_SHR);
   bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
 			&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
 			&& (TARGET_GCN3 || mode == V64DImode);
@@ -4184,36 +4211,60 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
   if (use_plus_carry)
     unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
 
+  if (use_extends)
+    {
+      rtx tmp = gen_reg_rtx (V64SImode);
+      convert_move (tmp, src, unsignedp);
+      src = tmp;
+      mode = V64SImode;
+    }
+
   /* Perform reduction by first performing the reduction operation on every
      pair of lanes, then on every pair of results from the previous
      iteration (thereby effectively reducing every 4 lanes) and so on until
      all lanes are reduced.  */
+  rtx in, out = src;
   for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
     {
       rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
-      rtx insn = gen_rtx_SET (tmp,
-			      gen_rtx_UNSPEC (mode,
-					      gen_rtvec (3,
-							 src, src, shift_val),
-					      unspec));
-
-      /* Add clobber for instructions that set the carry flags.  */
-      if (use_plus_carry)
+      in = out;
+      out = gen_reg_rtx (mode);
+
+      if (use_moves)
 	{
-	  rtx clobber = gen_rtx_CLOBBER (VOIDmode,
-					 gen_rtx_REG (DImode, VCC_REG));
-	  insn = gen_rtx_PARALLEL (VOIDmode,
-				   gen_rtvec (2, insn, clobber));
+	  rtx tmp = gen_reg_rtx (mode);
+	  emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
+	  emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
 	}
+      else
+	{
+	  rtx insn = gen_rtx_SET (out,
+				  gen_rtx_UNSPEC (mode,
+						  gen_rtvec (3, in, in,
+							     shift_val),
+						  unspec));
+
+	  /* Add clobber for instructions that set the carry flags.  */
+	  if (use_plus_carry)
+	    {
+	      rtx clobber = gen_rtx_CLOBBER (VOIDmode,
+					     gen_rtx_REG (DImode, VCC_REG));
+	      insn = gen_rtx_PARALLEL (VOIDmode,
+				       gen_rtvec (2, insn, clobber));
+	    }
 
-      emit_insn (insn);
+	  emit_insn (insn);
+	}
+    }
 
-      /* The source operands for every iteration after the first
-	   should be TMP.  */
-      src = tmp;
+  if (use_extends)
+    {
+      rtx tmp = gen_reg_rtx (orig_mode);
+      convert_move (tmp, out, unsignedp);
+      out = tmp;
     }
 
-  return tmp;
+  return out;
 }
 
 /* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.  */
@@ -5442,7 +5493,9 @@ print_operand_address (FILE *file, rtx mem)
    b - print operand size as untyped operand (b8/b16/b32/b64)
    B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
    i - print operand size as untyped operand (i16/b32/i64)
+   I - print operand size as SI/DI untyped operand (i32/b32/i64)
    u - print operand size as untyped operand (u16/u32/u64)
+   U - print operand size as SI/DI untyped operand (u32/u64)
    o - print operand size as memory access size for loads
        (ubyte/ushort/dword/dwordx2/wordx3/dwordx4)
    s - print operand size as memory access size for stores
@@ -5537,9 +5590,12 @@ print_operand (FILE *file, rtx x, int code)
       fputs (")", file);
       return;
     case 'i':
+    case 'I':
     case 'u':
+    case 'U':
       {
 	bool signed_p = code == 'i';
+	bool min32_p = code == 'I' || code == 'U';
 	const char *s = "";
 	machine_mode mode = GET_MODE (x);
 	if (VECTOR_MODE_P (mode))
@@ -5573,6 +5629,21 @@ print_operand (FILE *file, rtx x, int code)
 	      output_operand_lossage ("invalid operand %%xn code");
 	      return;
 	    }
+	else if (min32_p)
+	  switch (GET_MODE_SIZE (mode))
+	    {
+	    case 1:
+	    case 2:
+	    case 4:
+	      s = signed_p ? "_i32" : "_u32";
+	      break;
+	    case 8:
+	      s = signed_p ? "_i64" : "_u64";
+	      break;
+	    default:
+	      output_operand_lossage ("invalid operand %%xn code");
+	      return;
+	    }
 	else
 	  switch (GET_MODE_SIZE (mode))
 	    {
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 1bd3330f90b..d1a94cbe7b0 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -76,6 +76,7 @@
   UNSPEC_PLUS_DPP_SHR
   UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
   UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
+  UNSPEC_MOV_DPP_SHR
   UNSPEC_MOV_FROM_LANE63
   UNSPEC_GATHER
   UNSPEC_SCATTER])
-- 
2.30.2