From 86e2b03e3f8862d52fd7ff0945eab423ba03ad26 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 8 Apr 2020 08:39:28 +0200
Subject: [PATCH] aco: implement 8-bit/16-bit reductions

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4494>
---
 .../compiler/aco_instruction_selection.cpp    | 36 +++++++------
 src/amd/compiler/aco_ir.h                     |  8 +--
 src/amd/compiler/aco_lower_to_hw_instr.cpp    | 52 +++++++++++++++++++
 src/amd/compiler/aco_print_ir.cpp             |  4 --
 4 files changed, 76 insertions(+), 24 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 7747e6bbeea..a5eee112371 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -7669,27 +7669,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       } else if (cluster_size == 1) {
          bld.copy(Definition(dst), src);
       } else {
-         src = as_vgpr(ctx, src);
+         unsigned bit_size = instr->src[0].ssa->bit_size;
+
+         src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
 
          ReduceOp reduce_op;
          switch (op) {
-         #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
-            CASE(iadd)
-            CASE(imul)
-            CASE(fadd)
-            CASE(fmul)
-            CASE(imin)
-            CASE(umin)
-            CASE(fmin)
-            CASE(imax)
-            CASE(umax)
-            CASE(fmax)
-            CASE(iand)
-            CASE(ior)
-            CASE(ixor)
+         #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
+         #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
+            CASEI(iadd)
+            CASEI(imul)
+            CASEI(imin)
+            CASEI(umin)
+            CASEI(imax)
+            CASEI(umax)
+            CASEI(iand)
+            CASEI(ior)
+            CASEI(ixor)
+            CASEF(fadd)
+            CASEF(fmul)
+            CASEF(fmin)
+            CASEF(fmax)
             default:
                unreachable("unknown reduction op");
-         #undef CASE
+         #undef CASEI
+         #undef CASEF
          }
 
          aco_opcode aco_op;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 8fa6e48d452..1529f78cef7 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1127,14 +1127,14 @@ static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Un
 enum ReduceOp : uint16_t {
    iadd8, iadd16, iadd32, iadd64,
    imul8, imul16, imul32, imul64,
-   fadd8, fadd16, fadd32, fadd64,
-   fmul8, fmul16, fmul32, fmul64,
+          fadd16, fadd32, fadd64,
+          fmul16, fmul32, fmul64,
    imin8, imin16, imin32, imin64,
    imax8, imax16, imax32, imax64,
    umin8, umin16, umin32, umin64,
    umax8, umax16, umax32, umax64,
-   fmin8, fmin16, fmin32, fmin64,
-   fmax8, fmax16, fmax32, fmax64,
+          fmin16, fmin32, fmin64,
+          fmax16, fmax32, fmax64,
    iand8, iand16, iand32, iand64,
    ior8, ior16, ior32, ior64,
    ixor8, ixor16, ixor32, ixor64,
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 765a7f63a98..1d3061d5dd9 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -43,6 +43,22 @@ struct lower_context {
 
 aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
    switch (op) {
+   case iadd8:
+   case iadd16: return aco_opcode::v_add_u16;
+   case imul8:
+   case imul16: return aco_opcode::v_mul_lo_u16;
+   case fadd16: return aco_opcode::v_add_f16;
+   case fmul16: return aco_opcode::v_mul_f16;
+   case imax8:
+   case imax16: return aco_opcode::v_max_i16;
+   case imin8:
+   case imin16: return aco_opcode::v_min_i16;
+   case umin8:
+   case umin16: return aco_opcode::v_min_u16;
+   case umax8:
+   case umax16: return aco_opcode::v_max_u16;
+   case fmin16: return aco_opcode::v_min_f16;
+   case fmax16: return aco_opcode::v_max_f16;
    case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
    case imul32: return aco_opcode::v_mul_lo_u32;
    case fadd32: return aco_opcode::v_add_f32;
@@ -53,8 +69,14 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
    case umax32: return aco_opcode::v_max_u32;
    case fmin32: return aco_opcode::v_min_f32;
    case fmax32: return aco_opcode::v_max_f32;
+   case iand8:
+   case iand16:
    case iand32: return aco_opcode::v_and_b32;
+   case ixor8:
+   case ixor16:
    case ixor32: return aco_opcode::v_xor_b32;
+   case ior8:
+   case ior16:
    case ior32: return aco_opcode::v_or_b32;
    case iadd64: return aco_opcode::num_opcodes;
    case imul64: return aco_opcode::num_opcodes;
@@ -363,41 +385,71 @@ void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size,
 uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
 {
    switch (op) {
+   case iadd8:
+   case iadd16:
    case iadd32:
    case iadd64:
+   case fadd16:
    case fadd32:
    case fadd64:
+   case ior8:
+   case ior16:
    case ior32:
    case ior64:
+   case ixor8:
+   case ixor16:
    case ixor32:
    case ixor64:
+   case umax8:
+   case umax16:
    case umax32:
    case umax64:
       return 0;
+   case imul8:
+   case imul16:
    case imul32:
    case imul64:
       return idx ? 0 : 1;
+   case fmul16:
+      return 0x3c00u; /* 1.0 */
    case fmul32:
       return 0x3f800000u; /* 1.0 */
    case fmul64:
       return idx ? 0x3ff00000u : 0u; /* 1.0 */
+   case imin8:
+      return INT8_MAX;
+   case imin16:
+      return INT16_MAX;
    case imin32:
       return INT32_MAX;
    case imin64:
       return idx ? 0x7fffffffu : 0xffffffffu;
+   case imax8:
+      return INT8_MIN;
+   case imax16:
+      return INT16_MIN;
    case imax32:
       return INT32_MIN;
    case imax64:
       return idx ? 0x80000000u : 0;
+   case umin8:
+   case umin16:
+   case iand8:
+   case iand16:
+      return 0xffffffffu;
    case umin32:
    case umin64:
    case iand32:
    case iand64:
       return 0xffffffffu;
+   case fmin16:
+      return 0x7c00u; /* infinity */
    case fmin32:
       return 0x7f800000u; /* infinity */
    case fmin64:
       return idx ? 0x7ff00000u : 0u; /* infinity */
+   case fmax16:
+      return 0xfc00u; /* negative infinity */
    case fmax32:
       return 0xff800000u; /* negative infinity */
    case fmax64:
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index e3c8cd81add..2b18daef154 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -15,11 +15,9 @@ static const char *reduce_ops[] = {
    [imul16] = "imul16",
    [imul32] = "imul32",
    [imul64] = "imul64",
-   [fadd8] = "fadd8",
    [fadd16] = "fadd16",
    [fadd32] = "fadd32",
    [fadd64] = "fadd64",
-   [fmul8] = "fmul8",
    [fmul16] = "fmul16",
    [fmul32] = "fmul32",
    [fmul64] = "fmul64",
@@ -39,11 +37,9 @@ static const char *reduce_ops[] = {
    [umax16] = "umax16",
    [umax32] = "umax32",
    [umax64] = "umax64",
-   [fmin8] = "fmin8",
    [fmin16] = "fmin16",
    [fmin32] = "fmin32",
    [fmin64] = "fmin64",
-   [fmax8] = "fmax8",
    [fmax16] = "fmax16",
    [fmax32] = "fmax32",
    [fmax64] = "fmax64",
-- 
2.30.2