aco: implement 8-bit/16-bit reductions

author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 8 Apr 2020 06:39:28 +0000 (08:39 +0200)
committer Marge Bot <eric+marge@anholt.net>
Thu, 21 May 2020 15:06:48 +0000 (15:06 +0000)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index 7747e6bbeea63f63a22899a2183c80af40eca9e2..a5eee1123710711577ddc95a3d2f88c6eeaba652 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -7669,27 +7669,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        } else if (cluster_size == 1) {
           bld.copy(Definition(dst), src);
        } else {
-         src = as_vgpr(ctx, src);
+         unsigned bit_size = instr->src[0].ssa->bit_size;
+
+         src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
  
           ReduceOp reduce_op;
           switch (op) {
-         #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
-            CASE(iadd)
-            CASE(imul)
-            CASE(fadd)
-            CASE(fmul)
-            CASE(imin)
-            CASE(umin)
-            CASE(fmin)
-            CASE(imax)
-            CASE(umax)
-            CASE(fmax)
-            CASE(iand)
-            CASE(ior)
-            CASE(ixor)
+         #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
+         #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
+            CASEI(iadd)
+            CASEI(imul)
+            CASEI(imin)
+            CASEI(umin)
+            CASEI(imax)
+            CASEI(umax)
+            CASEI(iand)
+            CASEI(ior)
+            CASEI(ixor)
+            CASEF(fadd)
+            CASEF(fmul)
+            CASEF(fmin)
+            CASEF(fmax)
              default:
                 unreachable("unknown reduction op");
-         #undef CASE
+         #undef CASEI
+         #undef CASEF
           }
  
           aco_opcode aco_op;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h

index 8fa6e48d45240bf36b69af5c8c515b7348d27b07..1529f78cef78e86eabd029e6805dc2c543151d7e 100644 (file)
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1127,14 +1127,14 @@ static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Un
  enum ReduceOp : uint16_t {
     iadd8, iadd16, iadd32, iadd64,
     imul8, imul16, imul32, imul64,
-   fadd8, fadd16, fadd32, fadd64,
-   fmul8, fmul16, fmul32, fmul64,
+          fadd16, fadd32, fadd64,
+          fmul16, fmul32, fmul64,
     imin8, imin16, imin32, imin64,
     imax8, imax16, imax32, imax64,
     umin8, umin16, umin32, umin64,
     umax8, umax16, umax32, umax64,
-   fmin8, fmin16, fmin32, fmin64,
-   fmax8, fmax16, fmax32, fmax64,
+          fmin16, fmin32, fmin64,
+          fmax16, fmax32, fmax64,
     iand8, iand16, iand32, iand64,
     ior8, ior16, ior32, ior64,
     ixor8, ixor16, ixor32, ixor64,
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp

index 765a7f63a983afca282749e890adb3e7eeb10fd6..1d3061d5dd94b539c48d0f08370d6db032c07369 100644 (file)
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -43,6 +43,22 @@ struct lower_context {
  
  aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
     switch (op) {
+   case iadd8:
+   case iadd16: return aco_opcode::v_add_u16;
+   case imul8:
+   case imul16: return aco_opcode::v_mul_lo_u16;
+   case fadd16: return aco_opcode::v_add_f16;
+   case fmul16: return aco_opcode::v_mul_f16;
+   case imax8:
+   case imax16: return aco_opcode::v_max_i16;
+   case imin8:
+   case imin16: return aco_opcode::v_min_i16;
+   case umin8:
+   case umin16: return aco_opcode::v_min_u16;
+   case umax8:
+   case umax16: return aco_opcode::v_max_u16;
+   case fmin16: return aco_opcode::v_min_f16;
+   case fmax16: return aco_opcode::v_max_f16;
     case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
     case imul32: return aco_opcode::v_mul_lo_u32;
     case fadd32: return aco_opcode::v_add_f32;
@@ -53,8 +69,14 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
     case umax32: return aco_opcode::v_max_u32;
     case fmin32: return aco_opcode::v_min_f32;
     case fmax32: return aco_opcode::v_max_f32;
+   case iand8:
+   case iand16:
     case iand32: return aco_opcode::v_and_b32;
+   case ixor8:
+   case ixor16:
     case ixor32: return aco_opcode::v_xor_b32;
+   case ior8:
+   case ior16:
     case ior32: return aco_opcode::v_or_b32;
     case iadd64: return aco_opcode::num_opcodes;
     case imul64: return aco_opcode::num_opcodes;
@@ -363,41 +385,71 @@ void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size,
  uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
  {
     switch (op) {
+   case iadd8:
+   case iadd16:
     case iadd32:
     case iadd64:
+   case fadd16:
     case fadd32:
     case fadd64:
+   case ior8:
+   case ior16:
     case ior32:
     case ior64:
+   case ixor8:
+   case ixor16:
     case ixor32:
     case ixor64:
+   case umax8:
+   case umax16:
     case umax32:
     case umax64:
        return 0;
+   case imul8:
+   case imul16:
     case imul32:
     case imul64:
        return idx ? 0 : 1;
+   case fmul16:
+      return 0x3c00u; /* 1.0 */
     case fmul32:
        return 0x3f800000u; /* 1.0 */
     case fmul64:
        return idx ? 0x3ff00000u : 0u; /* 1.0 */
+   case imin8:
+      return INT8_MAX;
+   case imin16:
+      return INT16_MAX;
     case imin32:
        return INT32_MAX;
     case imin64:
        return idx ? 0x7fffffffu : 0xffffffffu;
+   case imax8:
+      return INT8_MIN;
+   case imax16:
+      return INT16_MIN;
     case imax32:
        return INT32_MIN;
     case imax64:
        return idx ? 0x80000000u : 0;
+   case umin8:
+   case umin16:
+   case iand8:
+   case iand16:
+      return 0xffffffffu;
     case umin32:
     case umin64:
     case iand32:
     case iand64:
        return 0xffffffffu;
+   case fmin16:
+      return 0x7c00u; /* infinity */
     case fmin32:
        return 0x7f800000u; /* infinity */
     case fmin64:
        return idx ? 0x7ff00000u : 0u; /* infinity */
+   case fmax16:
+      return 0xfc00u; /* negative infinity */
     case fmax32:
        return 0xff800000u; /* negative infinity */
     case fmax64:
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp

index e3c8cd81add857dfe33aa57b590bfdd629f4a5aa..2b18daef1544d96d94261ca7d3db79eb0bb8bf54 100644 (file)
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -15,11 +15,9 @@ static const char *reduce_ops[] = {
     [imul16] = "imul16",
     [imul32] = "imul32",
     [imul64] = "imul64",
-   [fadd8] = "fadd8",
     [fadd16] = "fadd16",
     [fadd32] = "fadd32",
     [fadd64] = "fadd64",
-   [fmul8] = "fmul8",
     [fmul16] = "fmul16",
     [fmul32] = "fmul32",
     [fmul64] = "fmul64",
@@ -39,11 +37,9 @@ static const char *reduce_ops[] = {
     [umax16] = "umax16",
     [umax32] = "umax32",
     [umax64] = "umax64",
-   [fmin8] = "fmin8",
     [fmin16] = "fmin16",
     [fmin32] = "fmin32",
     [fmin64] = "fmin64",
-   [fmax8] = "fmax8",
     [fmax16] = "fmax16",
     [fmax32] = "fmax32",
     [fmax64] = "fmax64",
author	Samuel Pitoiset <samuel.pitoiset@gmail.com>
	Wed, 8 Apr 2020 06:39:28 +0000 (08:39 +0200)
committer	Marge Bot <eric+marge@anholt.net>
	Thu, 21 May 2020 15:06:48 +0000 (15:06 +0000)
src/amd/compiler/aco_instruction_selection.cpp		patch | blob | history
src/amd/compiler/aco_ir.h		patch | blob | history
src/amd/compiler/aco_lower_to_hw_instr.cpp		patch | blob | history
src/amd/compiler/aco_print_ir.cpp		patch | blob | history