From 86e2b03e3f8862d52fd7ff0945eab423ba03ad26 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 8 Apr 2020 08:39:28 +0200 Subject: [PATCH] aco: implement 8-bit/16-bit reductions Signed-off-by: Samuel Pitoiset Reviewed-by: Rhys Perry Part-of: --- .../compiler/aco_instruction_selection.cpp | 36 +++++++------ src/amd/compiler/aco_ir.h | 8 +-- src/amd/compiler/aco_lower_to_hw_instr.cpp | 52 +++++++++++++++++++ src/amd/compiler/aco_print_ir.cpp | 4 -- 4 files changed, 76 insertions(+), 24 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 7747e6bbeea..a5eee112371 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -7669,27 +7669,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } else if (cluster_size == 1) { bld.copy(Definition(dst), src); } else { - src = as_vgpr(ctx, src); + unsigned bit_size = instr->src[0].ssa->bit_size; + + src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8)); ReduceOp reduce_op; switch (op) { - #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break; - CASE(iadd) - CASE(imul) - CASE(fadd) - CASE(fmul) - CASE(imin) - CASE(umin) - CASE(fmin) - CASE(imax) - CASE(umax) - CASE(fmax) - CASE(iand) - CASE(ior) - CASE(ixor) + #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break; + #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break; + CASEI(iadd) + CASEI(imul) + CASEI(imin) + CASEI(umin) + CASEI(imax) + CASEI(umax) + CASEI(iand) + CASEI(ior) + CASEI(ixor) + CASEF(fadd) + CASEF(fmul) + CASEF(fmin) + CASEF(fmax) default: unreachable("unknown reduction op"); - #undef CASE + #undef CASEI + #undef CASEF } aco_opcode aco_op; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 8fa6e48d452..1529f78cef7 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1127,14 +1127,14 @@ static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Un enum ReduceOp : uint16_t { iadd8, iadd16, iadd32, iadd64, imul8, imul16, imul32, imul64, - fadd8, fadd16, fadd32, fadd64, - fmul8, fmul16, fmul32, fmul64, + fadd16, fadd32, fadd64, + fmul16, fmul32, fmul64, imin8, imin16, imin32, imin64, imax8, imax16, imax32, imax64, umin8, umin16, umin32, umin64, umax8, umax16, umax32, umax64, - fmin8, fmin16, fmin32, fmin64, - fmax8, fmax16, fmax32, fmax64, + fmin16, fmin32, fmin64, + fmax16, fmax32, fmax64, iand8, iand16, iand32, iand64, ior8, ior16, ior32, ior64, ixor8, ixor16, ixor32, ixor64, diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 765a7f63a98..1d3061d5dd9 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -43,6 +43,22 @@ struct lower_context { aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { switch (op) { + case iadd8: + case iadd16: return aco_opcode::v_add_u16; + case imul8: + case imul16: return aco_opcode::v_mul_lo_u16; + case fadd16: return aco_opcode::v_add_f16; + case fmul16: return aco_opcode::v_mul_f16; + case imax8: + case imax16: return aco_opcode::v_max_i16; + case imin8: + case imin16: return aco_opcode::v_min_i16; + case umin8: + case umin16: return aco_opcode::v_min_u16; + case umax8: + case umax16: return aco_opcode::v_max_u16; + case fmin16: return aco_opcode::v_min_f16; + case fmax16: return aco_opcode::v_max_f16; case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32; case imul32: return aco_opcode::v_mul_lo_u32; case fadd32: return aco_opcode::v_add_f32; @@ -53,8 +69,14 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { case umax32: return aco_opcode::v_max_u32; case fmin32: return aco_opcode::v_min_f32; case fmax32: return aco_opcode::v_max_f32; + case iand8: + case iand16: case iand32: return aco_opcode::v_and_b32; + case ixor8: + case ixor16: case ixor32: return aco_opcode::v_xor_b32; + case ior8: + case ior16: case ior32: return aco_opcode::v_or_b32; case iadd64: return aco_opcode::num_opcodes; case imul64: return aco_opcode::num_opcodes; @@ -363,41 +385,71 @@ void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size, uint32_t get_reduction_identity(ReduceOp op, unsigned idx) { switch (op) { + case iadd8: + case iadd16: case iadd32: case iadd64: + case fadd16: case fadd32: case fadd64: + case ior8: + case ior16: case ior32: case ior64: + case ixor8: + case ixor16: case ixor32: case ixor64: + case umax8: + case umax16: case umax32: case umax64: return 0; + case imul8: + case imul16: case imul32: case imul64: return idx ? 0 : 1; + case fmul16: + return 0x3c00u; /* 1.0 */ case fmul32: return 0x3f800000u; /* 1.0 */ case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */ + case imin8: + return INT8_MAX; + case imin16: + return INT16_MAX; case imin32: return INT32_MAX; case imin64: return idx ? 0x7fffffffu : 0xffffffffu; + case imax8: + return INT8_MIN; + case imax16: + return INT16_MIN; case imax32: return INT32_MIN; case imax64: return idx ? 0x80000000u : 0; + case umin8: + case umin16: + case iand8: + case iand16: + return 0xffffffffu; case umin32: case umin64: case iand32: case iand64: return 0xffffffffu; + case fmin16: + return 0x7c00u; /* infinity */ case fmin32: return 0x7f800000u; /* infinity */ case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */ + case fmax16: + return 0xfc00u; /* negative infinity */ case fmax32: return 0xff800000u; /* negative infinity */ case fmax64: diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index e3c8cd81add..2b18daef154 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -15,11 +15,9 @@ static const char *reduce_ops[] = { [imul16] = "imul16", [imul32] = "imul32", [imul64] = "imul64", - [fadd8] = "fadd8", [fadd16] = "fadd16", [fadd32] = "fadd32", [fadd64] = "fadd64", - [fmul8] = "fmul8", [fmul16] = "fmul16", [fmul32] = "fmul32", [fmul64] = "fmul64", @@ -39,11 +37,9 @@ static const char *reduce_ops[] = { [umax16] = "umax16", [umax32] = "umax32", [umax64] = "umax64", - [fmin8] = "fmin8", [fmin16] = "fmin16", [fmin32] = "fmin32", [fmin64] = "fmin64", - [fmax8] = "fmax8", [fmax16] = "fmax16", [fmax32] = "fmax32", [fmax64] = "fmax64", -- 2.30.2