} else if (cluster_size == 1) {
bld.copy(Definition(dst), src);
} else {
- src = as_vgpr(ctx, src);
+ unsigned bit_size = instr->src[0].ssa->bit_size;
+
+ src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
ReduceOp reduce_op;
switch (op) {
- #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
- CASE(iadd)
- CASE(imul)
- CASE(fadd)
- CASE(fmul)
- CASE(imin)
- CASE(umin)
- CASE(fmin)
- CASE(imax)
- CASE(umax)
- CASE(fmax)
- CASE(iand)
- CASE(ior)
- CASE(ixor)
+ #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
+ #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
+ CASEI(iadd)
+ CASEI(imul)
+ CASEI(imin)
+ CASEI(umin)
+ CASEI(imax)
+ CASEI(umax)
+ CASEI(iand)
+ CASEI(ior)
+ CASEI(ixor)
+ CASEF(fadd)
+ CASEF(fmul)
+ CASEF(fmin)
+ CASEF(fmax)
default:
unreachable("unknown reduction op");
- #undef CASE
+ #undef CASEI
+ #undef CASEF
}
aco_opcode aco_op;
enum ReduceOp : uint16_t {
iadd8, iadd16, iadd32, iadd64,
imul8, imul16, imul32, imul64,
- fadd8, fadd16, fadd32, fadd64,
- fmul8, fmul16, fmul32, fmul64,
+ fadd16, fadd32, fadd64,
+ fmul16, fmul32, fmul64,
imin8, imin16, imin32, imin64,
imax8, imax16, imax32, imax64,
umin8, umin16, umin32, umin64,
umax8, umax16, umax32, umax64,
- fmin8, fmin16, fmin32, fmin64,
- fmax8, fmax16, fmax32, fmax64,
+ fmin16, fmin32, fmin64,
+ fmax16, fmax32, fmax64,
iand8, iand16, iand32, iand64,
ior8, ior16, ior32, ior64,
ixor8, ixor16, ixor32, ixor64,
aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
switch (op) {
+ case iadd8:
+ case iadd16: return aco_opcode::v_add_u16;
+ case imul8:
+ case imul16: return aco_opcode::v_mul_lo_u16;
+ case fadd16: return aco_opcode::v_add_f16;
+ case fmul16: return aco_opcode::v_mul_f16;
+ case imax8:
+ case imax16: return aco_opcode::v_max_i16;
+ case imin8:
+ case imin16: return aco_opcode::v_min_i16;
+ case umin8:
+ case umin16: return aco_opcode::v_min_u16;
+ case umax8:
+ case umax16: return aco_opcode::v_max_u16;
+ case fmin16: return aco_opcode::v_min_f16;
+ case fmax16: return aco_opcode::v_max_f16;
case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
case imul32: return aco_opcode::v_mul_lo_u32;
case fadd32: return aco_opcode::v_add_f32;
case umax32: return aco_opcode::v_max_u32;
case fmin32: return aco_opcode::v_min_f32;
case fmax32: return aco_opcode::v_max_f32;
+ case iand8:
+ case iand16:
case iand32: return aco_opcode::v_and_b32;
+ case ixor8:
+ case ixor16:
case ixor32: return aco_opcode::v_xor_b32;
+ case ior8:
+ case ior16:
case ior32: return aco_opcode::v_or_b32;
case iadd64: return aco_opcode::num_opcodes;
case imul64: return aco_opcode::num_opcodes;
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
{
switch (op) {
+ case iadd8:
+ case iadd16:
case iadd32:
case iadd64:
+ case fadd16:
case fadd32:
case fadd64:
+ case ior8:
+ case ior16:
case ior32:
case ior64:
+ case ixor8:
+ case ixor16:
case ixor32:
case ixor64:
+ case umax8:
+ case umax16:
case umax32:
case umax64:
return 0;
+ case imul8:
+ case imul16:
case imul32:
case imul64:
return idx ? 0 : 1;
+ case fmul16:
+ return 0x3c00u; /* 1.0 */
case fmul32:
return 0x3f800000u; /* 1.0 */
case fmul64:
return idx ? 0x3ff00000u : 0u; /* 1.0 */
+ case imin8:
+ return INT8_MAX;
+ case imin16:
+ return INT16_MAX;
case imin32:
return INT32_MAX;
case imin64:
return idx ? 0x7fffffffu : 0xffffffffu;
+ case imax8:
+ return INT8_MIN;
+ case imax16:
+ return INT16_MIN;
case imax32:
return INT32_MIN;
case imax64:
return idx ? 0x80000000u : 0;
+ case umin8:
+ case umin16:
+ case iand8:
+ case iand16:
+ return 0xffffffffu;
case umin32:
case umin64:
case iand32:
case iand64:
return 0xffffffffu;
+ case fmin16:
+ return 0x7c00u; /* infinity */
case fmin32:
return 0x7f800000u; /* infinity */
case fmin64:
return idx ? 0x7ff00000u : 0u; /* infinity */
+ case fmax16:
+ return 0xfc00u; /* negative infinity */
case fmax32:
return 0xff800000u; /* negative infinity */
case fmax64:
[imul16] = "imul16",
[imul32] = "imul32",
[imul64] = "imul64",
- [fadd8] = "fadd8",
[fadd16] = "fadd16",
[fadd32] = "fadd32",
[fadd64] = "fadd64",
- [fmul8] = "fmul8",
[fmul16] = "fmul16",
[fmul32] = "fmul32",
[fmul64] = "fmul64",
[umax16] = "umax16",
[umax32] = "umax32",
[umax64] = "umax64",
- [fmin8] = "fmin8",
[fmin16] = "fmin16",
[fmin32] = "fmin32",
[fmin64] = "fmin64",
- [fmax8] = "fmax8",
[fmax16] = "fmax16",
[fmax32] = "fmax32",
[fmax64] = "fmax64",