X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_instruction_selection.cpp;h=9f97e1fc64759dc0474bc1dfe24bb4d2976d894c;hb=3d6cc14513c1032ff8b24b378354aa7fdb99c6fe;hp=b13b2372f4b68688d4d0c6328bd81d30ca287f3e;hpb=9392ddab4399d796fdf37602f586965ec17f2b2a;p=mesa.git
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index b13b2372f4b..9f97e1fc647 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -711,9 +711,8 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
 {
    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
-   bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
    bool use_valu = s_op == aco_opcode::num_opcodes ||
-                   divergent_vals ||
+                   nir_dest_is_divergent(instr->dest.dest) ||
                    ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
                    ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
    aco_opcode op = use_valu ? v_op : s_op;
@@ -750,18 +749,12 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
 
    if (dst.type() == RegType::vgpr) {
       aco_ptr<Instruction> bcsel;
-      if (dst.regClass() == v2b) {
-         then = as_vgpr(ctx, then);
-         els = as_vgpr(ctx, els);
-
-         Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), els, then, cond);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
-      } else if (dst.regClass() == v1) {
+      if (dst.size() == 1) {
          then = as_vgpr(ctx, then);
          els = as_vgpr(ctx, els);
 
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
-      } else if (dst.regClass() == v2) {
+      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
@@ -785,7 +778,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
       assert(els.regClass() == bld.lm);
    }
 
-   if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
+   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
       if (dst.regClass() == s1 || dst.regClass() == s2) {
          assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
          assert(dst.size() == then.size());
@@ -884,7 +877,8 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
 
    /* Extract the exponent and compute the unbiased value. */
-   Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val);
+   Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
+   exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
 
    /* Extract the fractional part. */
    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
@@ -900,7 +894,7 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
 
    /* Get the sign bit.
*/ - Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi); + Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi); /* Decide the operation to apply depending on the unbiased exponent. */ Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); @@ -1062,13 +1056,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src); else unreachable("wrong src register class for nir_op_imov"); - } else if (dst.regClass() == v1) { - bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); - } else if (dst.regClass() == v2) { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); } else { - nir_print_instr(&instr->instr, stderr); - unreachable("Should have been lowered to scalar."); + if (dst.regClass() == v1) + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); + else if (dst.regClass() == v1b || + dst.regClass() == v2b || + dst.regClass() == v2) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + else + unreachable("wrong src register class for nir_op_imov"); } break; } @@ -1601,9 +1597,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); } else if (dst.regClass() == v2) { @@ -1619,9 +1613,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); } else if (dst.regClass() == v2) { @@ -1637,12 +1629,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) - emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false); + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); else - emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); } else if (dst.regClass() == v1) { if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); @@ -1665,9 +1655,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { // TODO: check fp_mode.must_flush_denorms16_64 - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true); - 
bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { @@ -1689,9 +1677,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { // TODO: check fp_mode.must_flush_denorms16_64 - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { @@ -1710,9 +1696,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmax3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1724,9 +1708,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmin3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1738,9 +1720,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmed3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1839,8 +1819,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frsq: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); } else if (dst.regClass() == v1) { emit_rsq(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { @@ -1855,8 +1834,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fneg: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src)); } else if 
(dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); @@ -1878,8 +1856,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fabs: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src)); } else if (dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); @@ -1901,8 +1878,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsat: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), Operand(0x3f800000u), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3f800000u), src); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ @@ -1921,8 +1897,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_flog2: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); } else if (dst.regClass() == v1) { emit_log2(ctx, bld, Definition(dst), src); } else { @@ -1935,8 +1910,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frcp: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); } else if (dst.regClass() == v1) { emit_rcp(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { @@ -1950,9 +1924,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fexp2: { if (dst.regClass() == v2b) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); } else { @@ -1965,8 +1937,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsqrt: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); } else if (dst.regClass() == v1) { emit_sqrt(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { @@ -1980,9 +1951,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ffract: { if (dst.regClass() == v2b) 
{ - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); } else if (dst.regClass() == v2) { @@ -1997,8 +1966,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ffloor: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); } else if (dst.regClass() == v2) { @@ -2013,8 +1981,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fceil: { Temp src0 = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); } else if (dst.regClass() == v2) { @@ -2044,8 +2011,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ftrunc: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); } else if (dst.regClass() == v2) { @@ -2060,8 +2026,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fround_even: { Temp src0 = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); } else if (dst.regClass() == v2) { @@ -2106,8 +2071,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == v2b) { Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); aco_opcode opcode = instr->op == nir_op_fsin ? 
aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; - tmp = bld.vop1(opcode, bld.def(v1), tmp); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(opcode, Definition(dst), tmp); } else if (dst.regClass() == v1) { Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src); @@ -2128,9 +2092,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1); } else if (dst.regClass() == v2) { @@ -2145,8 +2107,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frexp_sig: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src); } else if (dst.regClass() == v1) { bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src); } else if (dst.regClass() == v2) { @@ -2183,8 +2144,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond); cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), minus_one, src, cond); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond); } else if (dst.regClass() == v1) { Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); @@ -2212,16 +2172,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 64) src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); - src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src); + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); break; } case nir_op_f2f16_rtz: { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 64) src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); - src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src); + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u)); break; } case nir_op_f2f32: { @@ -2248,8 +2206,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) src = convert_int(bld, src, 8, 16, true); - Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(aco_opcode::v_cvt_f16_i16, 
Definition(dst), src); break; } case nir_op_i2f32: { @@ -2288,8 +2245,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) src = convert_int(bld, src, 8, 16, false); - Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); break; } case nir_op_u2f32: { @@ -2581,8 +2537,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src); } else if (dst.regClass() == v2b) { Temp one = bld.copy(bld.def(v1), Operand(0x3c00u)); - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src); } else { unreachable("Wrong destination register class for nir_op_b2f16."); } @@ -3987,7 +3942,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset bool allow_combining = true, bool reorder = true, bool slc = false) { Builder bld(ctx->program, ctx->block); - assert(elem_size_bytes == 4 || elem_size_bytes == 8); + assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); assert(write_mask); write_mask = widen_mask(write_mask, elem_size_bytes); @@ -4007,8 +3962,8 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true) { - assert(elem_size_bytes == 4 || elem_size_bytes == 8); - assert((num_components * elem_size_bytes / 4) == dst.size()); + assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); + assert((num_components * elem_size_bytes) == dst.bytes()); assert(!!stride != allow_combining); Builder bld(ctx->program, ctx->block); @@ -4153,11 +4108,9 @@ std::pair get_tcs_output_lds_offset(isel_context *ctx, nir_intri Builder bld(ctx->program, ctx->block); uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16; - uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; + uint32_t output_vertex_size = ctx->tcs_num_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; - uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16; std::pair offs = instr ? get_intrinsic_io_basic_offset(ctx, instr, 4u) @@ -4205,11 +4158,7 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx { Builder bld(ctx->program, ctx->block); - unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL - ? 
util_last_bit64(ctx->args->shader_info->tcs.outputs_written) - : ctx->args->options->key.tes.tcs_num_outputs; - - unsigned output_vertex_size = num_tcs_outputs * 16; + unsigned output_vertex_size = ctx->tcs_num_outputs * 16; unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches; unsigned attr_stride = ctx->tcs_num_patches; @@ -4230,10 +4179,12 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect) { + assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + if (mask == 0) return false; - unsigned off = nir_intrinsic_base(instr) * 4u; + unsigned drv_loc = nir_intrinsic_base(instr); nir_src *off_src = nir_get_io_offset_src(instr); if (!nir_src_is_const(*off_src)) { @@ -4242,15 +4193,10 @@ bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr } *indirect = false; - off += nir_src_as_uint(*off_src) * 16u; - - while (mask) { - unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0); - if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u) - return true; - } - - return false; + uint64_t slot = per_vertex + ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4] + : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0); + return (((uint64_t) 1) << slot) & mask; } bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) @@ -4269,10 +4215,12 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) if (instr->src[0].ssa->bit_size == 64) write_mask = widen_mask(write_mask, 2); + RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1; + for (unsigned i = 0; i < 8; ++i) { if (write_mask & (1 << i)) { ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u); - ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1); + ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc); } idx++; } @@ -4344,9 +4292,8 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS. * GFX9+: LS is merged into HS, but still uses the same LDS layout. 
*/ - unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id); - lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u); + lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u); } else { unreachable("Invalid LS or ES stage"); } @@ -4477,10 +4424,40 @@ void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp Temp coord2 = emit_extract_vector(ctx, src, 1, v1); Builder bld(ctx->program, ctx->block); - Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component); - if (ctx->program->has_16bank_lds) - interp_p1.instr->operands[0].setLateKill(true); - bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component); + + if (dst.regClass() == v2b) { + if (ctx->program->has_16bank_lds) { + assert(ctx->options->chip_class <= GFX8); + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), + Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component); + interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), + coord1, bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, + bld.m0(prim_mask), interp_p1, idx, component); + } else { + aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16; + + if (ctx->options->chip_class == GFX8) + interp_p2_op = aco_opcode::v_interp_p2_legacy_f16; + + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), + coord1, bld.m0(prim_mask), idx, component); + bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), + interp_p1, idx, component); + } + } else { + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, + bld.m0(prim_mask), idx, component); + + if (ctx->program->has_16bank_lds) + interp_p1.instr->operands[0].setLateKill(true); + + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, + bld.m0(prim_mask), interp_p1, idx, component); + } } void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) @@ -4646,6 +4623,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset; unsigned component = nir_intrinsic_component(instr); + unsigned bitsize = instr->dest.ssa.bit_size; unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; @@ -4702,7 +4680,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) /* load channels */ while (channel_start < num_channels) { - unsigned fetch_size = num_channels - channel_start; + unsigned fetch_component = num_channels - channel_start; unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; bool expanded = false; @@ -4714,15 +4692,17 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) vtx_info->chan_byte_size == 4; unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; if (!use_mubuf) { - fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size); + fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component); } else { - if (fetch_size == 3 && 
ctx->options->chip_class == GFX6) { + if (fetch_component == 3 && ctx->options->chip_class == GFX6) { /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */ - fetch_size = 4; + fetch_component = 4; expanded = true; } } + unsigned fetch_bytes = fetch_component * bitsize / 8; + Temp fetch_index = index; if (attrib_stride != 0 && fetch_offset > attrib_stride) { fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); @@ -4736,19 +4716,37 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } aco_opcode opcode; - switch (fetch_size) { - case 1: - opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; - break; + switch (fetch_bytes) { case 2: - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + assert(!use_mubuf && bitsize == 16); + opcode = aco_opcode::tbuffer_load_format_d16_x; + break; + case 4: + if (bitsize == 16) { + assert(!use_mubuf); + opcode = aco_opcode::tbuffer_load_format_d16_xy; + } else { + opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; + } + break; + case 6: + assert(!use_mubuf && bitsize == 16); + opcode = aco_opcode::tbuffer_load_format_d16_xyz; + break; + case 8: + if (bitsize == 16) { + assert(!use_mubuf); + opcode = aco_opcode::tbuffer_load_format_d16_xyzw; + } else { + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + } break; - case 3: + case 12: assert(ctx->options->chip_class >= GFX7 || (!use_mubuf && ctx->options->chip_class == GFX6)); opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; break; - case 4: + case 16: opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; break; default: @@ -4756,13 +4754,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } Temp fetch_dst; - if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle && + if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE || num_channels <= 3)) { direct_fetch = true; fetch_dst = dst; } else { - fetch_dst = bld.tmp(RegType::vgpr, fetch_size); + fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes)); } if (use_mubuf) { @@ -4779,14 +4777,15 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, fetch_dst, fetch_dst.size()); - if (fetch_size == 1) { + if (fetch_component == 1) { channels[channel_start] = fetch_dst; } else { - for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++) - channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1); + for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) + channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, + bitsize == 16 ? 
v2b : v1); } - channel_start += fetch_size; + channel_start += fetch_component; } if (!direct_fetch) { @@ -5066,7 +5065,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); Temp index = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) + if (!nir_dest_is_divergent(instr->dest)) index = bld.as_uniform(index); unsigned desc_set = nir_intrinsic_desc_set(instr); unsigned binding = nir_intrinsic_binding(instr); @@ -6142,7 +6141,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); - bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] && + bool smem = !nir_src_is_divergent(instr->src[2]) && ctx->options->chip_class >= GFX8 && elem_size_bytes >= 4; if (smem) @@ -7533,14 +7532,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_shuffle: case nir_intrinsic_read_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->src[0].ssa->index]) { + if (!nir_src_is_divergent(instr->src[0])) { emit_uniform_subgroup(ctx, instr, src); } else { Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); - if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index]) + if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1])) tid = bld.as_uniform(tid); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (src.regClass() == v1) { + if (src.regClass() == v1b || src.regClass() == v2b) { + Temp tmp = bld.tmp(v1); + tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp); + if (dst.type() == RegType::vgpr) + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else if (src.regClass() == v1) { emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst); } else if (src.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); @@ -7585,7 +7591,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_read_first_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (src.regClass() == v1) { + if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst); @@ -7643,7 +7649,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) nir_intrinsic_cluster_size(instr) : 0; cluster_size = util_next_power_of_two(MIN2(cluster_size ? 
cluster_size : ctx->program->wave_size, ctx->program->wave_size)); - if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) { + if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) { emit_uniform_subgroup(ctx, instr, src); } else if (instr->dest.ssa.bit_size == 1) { if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin) @@ -7670,27 +7676,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } else if (cluster_size == 1) { bld.copy(Definition(dst), src); } else { - src = as_vgpr(ctx, src); + unsigned bit_size = instr->src[0].ssa->bit_size; + + src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8)); ReduceOp reduce_op; switch (op) { - #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break; - CASE(iadd) - CASE(imul) - CASE(fadd) - CASE(fmul) - CASE(imin) - CASE(umin) - CASE(fmin) - CASE(imax) - CASE(umax) - CASE(fmax) - CASE(iand) - CASE(ior) - CASE(ixor) + #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break; + #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break; + CASEI(iadd) + CASEI(imul) + CASEI(imin) + CASEI(umin) + CASEI(imax) + CASEI(umax) + CASEI(iand) + CASEI(ior) + CASEI(ixor) + CASEF(fadd) + CASEF(fmul) + CASEF(fmin) + CASEF(fmax) default: unreachable("unknown reduction op"); - #undef CASE + #undef CASEI + #undef CASEF } aco_opcode aco_op; @@ -7726,7 +7736,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_quad_broadcast: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); } else { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -7743,6 +7753,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 8) { + Temp tmp = bld.tmp(v1); + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); + } else if (instr->dest.ssa.bit_size == 16) { + Temp tmp = bld.tmp(v1); + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { if (ctx->program->chip_class >= GFX8) emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst); @@ -7773,7 +7791,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_quad_swap_diagonal: case nir_intrinsic_quad_swizzle_amd: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); break; } @@ -7807,6 +7825,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl); Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); emit_wqm(ctx, tmp, dst); + } 
else if (instr->dest.ssa.bit_size == 8) { + Temp tmp = bld.tmp(v1); + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); + } else if (instr->dest.ssa.bit_size == 16) { + Temp tmp = bld.tmp(v1); + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { Temp tmp; if (ctx->program->chip_class >= GFX8) @@ -7835,7 +7861,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_masked_swizzle_amd: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); break; } @@ -7932,10 +7958,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) get_ssa_temp(ctx, &instr->dest.ssa)); break; } - case nir_intrinsic_shader_clock: - bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); + case nir_intrinsic_shader_clock: { + aco_opcode opcode = + nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ? + aco_opcode::s_memrealtime : aco_opcode::s_memtime; + bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2); break; + } case nir_intrinsic_load_vertex_id_zero_base: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id)); @@ -8227,9 +8257,11 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) { Builder bld(ctx->program, ctx->block); bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, - has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false; + has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false, + has_clamped_lod = false; Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), - lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(); + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), + clamped_lod = Temp(); std::vector coords; std::vector derivs; nir_const_value *sample_index_cv = NULL; @@ -8251,10 +8283,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) break; } case nir_tex_src_bias: - if (instr->op == nir_texop_txb) { - bias = get_ssa_temp(ctx, instr->src[i].src.ssa); - has_bias = true; - } + bias = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_bias = true; break; case nir_tex_src_lod: { nir_const_value *val = nir_src_as_const_value(instr->src[i].src); @@ -8267,6 +8297,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } break; } + case nir_tex_src_min_lod: + clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_clamped_lod = true; + break; case nir_tex_src_comparator: if (instr->is_shadow) { compare = get_ssa_temp(ctx, instr->src[i].src.ssa); @@ -8667,6 +8701,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) args.emplace_back(sample_index); if (has_lod) args.emplace_back(lod); + if (has_clamped_lod) + args.emplace_back(clamped_lod); Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size())); aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)}; @@ -8711,7 +8747,21 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) // TODO: would be better 
to do this by adding offsets, but needs the opcodes ordered. aco_opcode opcode = aco_opcode::image_sample; if (has_offset) { /* image_sample_*_o */ - if (has_compare) { + if (has_clamped_lod) { + if (has_compare) { + opcode = aco_opcode::image_sample_c_cl_o; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_cl_o; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_cl_o; + } else { + opcode = aco_opcode::image_sample_cl_o; + if (has_derivs) + opcode = aco_opcode::image_sample_d_cl_o; + if (has_bias) + opcode = aco_opcode::image_sample_b_cl_o; + } + } else if (has_compare) { opcode = aco_opcode::image_sample_c_o; if (has_derivs) opcode = aco_opcode::image_sample_c_d_o; @@ -8732,6 +8782,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (has_lod) opcode = aco_opcode::image_sample_l_o; } + } else if (has_clamped_lod) { /* image_sample_*_cl */ + if (has_compare) { + opcode = aco_opcode::image_sample_c_cl; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_cl; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_cl; + } else { + opcode = aco_opcode::image_sample_cl; + if (has_derivs) + opcode = aco_opcode::image_sample_d_cl; + if (has_bias) + opcode = aco_opcode::image_sample_b_cl; + } } else { /* no offset */ if (has_compare) { opcode = aco_opcode::image_sample_c; @@ -8757,14 +8821,34 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (instr->op == nir_texop_tg4) { - if (has_offset) { - opcode = aco_opcode::image_gather4_lz_o; - if (has_compare) + if (has_offset) { /* image_gather4_*_o */ + if (has_compare) { opcode = aco_opcode::image_gather4_c_lz_o; + if (has_lod) + opcode = aco_opcode::image_gather4_c_l_o; + if (has_bias) + opcode = aco_opcode::image_gather4_c_b_o; + } else { + opcode = aco_opcode::image_gather4_lz_o; + if (has_lod) + opcode = aco_opcode::image_gather4_l_o; + if (has_bias) + opcode = aco_opcode::image_gather4_b_o; + } } else { - opcode = aco_opcode::image_gather4_lz; - if (has_compare) + if (has_compare) { opcode = aco_opcode::image_gather4_c_lz; + if (has_lod) + opcode = aco_opcode::image_gather4_c_l; + if (has_bias) + opcode = aco_opcode::image_gather4_c_b; + } else { + opcode = aco_opcode::image_gather4_lz; + if (has_lod) + opcode = aco_opcode::image_gather4_l; + if (has_bias) + opcode = aco_opcode::image_gather4_b; + } } } else if (instr->op == nir_texop_lod) { opcode = aco_opcode::image_get_lod; @@ -8830,7 +8914,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); - bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index]; + bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest); logical |= ctx->block->kind & block_kind_merge; aco_opcode opcode = logical ? 
aco_opcode::p_phi : aco_opcode::p_linear_phi; @@ -9524,7 +9608,7 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) aco_ptr branch; if_context ic; - if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */ + if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */ /** * Uniform conditionals are represented in the following way*) : * @@ -9868,6 +9952,7 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1; bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1; + bool is_16bit = values[0].regClass() == v2b; switch (col_format) { @@ -9898,16 +9983,38 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) case V_028714_SPI_SHADER_FP16_ABGR: enabled_channels = 0x5; compr_op = aco_opcode::v_cvt_pkrtz_f16_f32; + if (is_16bit) { + if (ctx->options->chip_class >= GFX9) { + /* Pack the FP16 values together instead of converting them to + * FP32 and back to FP16. + * TODO: use p_create_vector and let the compiler optimizes. + */ + compr_op = aco_opcode::v_pack_b32_f16; + } else { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) + values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]); + } + } + } break; case V_028714_SPI_SHADER_UNORM16_ABGR: enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + if (is_16bit && ctx->options->chip_class >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_u16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + } break; case V_028714_SPI_SHADER_SNORM16_ABGR: enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + if (is_16bit && ctx->options->chip_class >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_i16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + } break; case V_028714_SPI_SHADER_UINT16_ABGR: { @@ -9925,6 +10032,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i]); } } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + Temp tmp = convert_int(bld, values[i].getTemp(), 16, 32, false); + values[i] = Operand(tmp); + } + } } break; } @@ -9949,6 +10063,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i]); } } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + Temp tmp = convert_int(bld, values[i].getTemp(), 16, 32, true); + values[i] = Operand(tmp); + } + } } break; @@ -10739,8 +10860,6 @@ void select_program(Program *program, if (ngg_no_gs && !ngg_early_prim_export(&ctx)) ngg_emit_nogs_output(&ctx); - ralloc_free(ctx.divergent_vals); - if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) { /* Outputs of the previous stage are inputs to the next stage */ ctx.inputs = ctx.outputs;
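
A note on the recurring pattern in the hunks above: many of them replace the old "compute a 16-bit result into a full 32-bit VGPR, then extract the low half with p_split_vector" sequence by writing the sub-dword (v2b) destination directly, which assumes the register allocator can now handle sub-dword register classes. A minimal before/after sketch of that pattern, using the Builder helpers that appear in the hunks themselves (illustrative only, not part of the change):

   /* Before: 16-bit result built in a v1 temporary, low half split off. */
   Temp tmp = bld.tmp(v1);
   emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true);
   bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);

   /* After: the VOP2 instruction writes the v2b destination directly. */
   emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);

The same direct-destination idea drives the other 16-bit paths touched here: tbuffer_load_format_d16_* fetches for 16-bit vertex inputs, the v_interp_*_f16 opcodes for 16-bit FS interpolation, the 16-bit subgroup reduction cases, and the FP16 color export conversions.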