X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_instruction_selection.cpp;h=65cf38f57e9851e0783db430f324906a65e2484d;hb=50c2c76ea31edf987594e8b811b7d62be71f5a33;hp=aa1648f0d1b5eb89b1ae22fe59407db0b061c020;hpb=ffb4790279ca779572ec393ba84d71ef1036b437;p=mesa.git diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index aa1648f0d1b..65cf38f57e9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include "ac_shader_util.h" @@ -70,7 +71,7 @@ public: ctx->cf_info.parent_if.is_divergent = divergent_if_old; ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1; if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) - ctx->cf_info.exec_potentially_empty = false; + ctx->cf_info.exec_potentially_empty_discard = false; } }; @@ -78,16 +79,19 @@ struct if_context { Temp cond; bool divergent_old; - bool exec_potentially_empty_old; + bool exec_potentially_empty_discard_old; + bool exec_potentially_empty_break_old; + uint16_t exec_potentially_empty_break_depth_old; unsigned BB_if_idx; unsigned invert_idx; + bool uniform_has_then_branch; bool then_branch_divergent; Block BB_invert; Block BB_endif; }; -static void visit_cf_list(struct isel_context *ctx, +static bool visit_cf_list(struct isel_context *ctx, struct exec_list *list); static void add_logical_edge(unsigned pred_idx, Block *succ) @@ -188,7 +192,7 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data Temp lane_id = emit_mbcnt(ctx, bld.def(v1)); Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id); Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index); - Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi); + Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi); return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute); @@ -267,21 +271,24 @@ Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst assert(idx == 0); return src; } - assert(src.size() > idx); + + assert(src.bytes() > (idx * dst_rc.bytes())); Builder bld(ctx->program, ctx->block); auto it = ctx->allocated_vec.find(src.id()); - /* the size check needs to be early because elements other than 0 may be garbage */ - if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) { + if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) { if (it->second[idx].regClass() == dst_rc) { return it->second[idx]; } else { - assert(dst_rc.size() == it->second[idx].regClass().size()); + assert(!dst_rc.is_subdword()); assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr); return bld.copy(bld.def(dst_rc), it->second[idx]); } } - if (src.size() == dst_rc.size()) { + if (dst_rc.is_subdword()) + src = as_vgpr(ctx, src); + + if (src.bytes() == dst_rc.bytes()) { assert(idx == 0); return bld.copy(bld.def(dst_rc), src); } else { @@ -297,11 +304,23 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) return; if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end()) return; + RegClass rc; + if (num_components > vec_src.size()) { + if (vec_src.type() 
== RegType::sgpr) { + /* should still help get_alu_src() */ + emit_split_vector(ctx, vec_src, vec_src.size()); + return; + } + /* sub-dword split */ + rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword(); + } else { + rc = RegClass(vec_src.type(), vec_src.size() / num_components); + } aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; split->operands[0] = Operand(vec_src); std::array elems; for (unsigned i = 0; i < num_components; i++) { - elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)}; + elems[i] = {ctx->program->allocateId(), rc}; split->definitions[i] = Definition(elems[i]); } ctx->block->instructions.emplace_back(std::move(split)); @@ -347,6 +366,82 @@ void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_compo ctx->allocated_vec.emplace(dst.id(), elems); } +/* adjust misaligned small bit size loads */ +void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + Operand shift; + Temp select = Temp(); + if (offset.isConstant()) { + assert(offset.constantValue() && offset.constantValue() < 4); + shift = Operand(offset.constantValue() * 8); + } else { + /* bit_offset = 8 * (offset & 0x3) */ + Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u)); + select = bld.tmp(s1); + shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u)); + } + + if (vec.size() == 1) { + bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift); + } else if (vec.size() == 2) { + Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2); + bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift); + if (tmp == dst) + emit_split_vector(ctx, dst, 2); + else + emit_extract_vector(ctx, tmp, 0, dst); + } else if (vec.size() == 4) { + Temp lo = bld.tmp(s2), hi = bld.tmp(s2); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec); + hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u)); + if (select != Temp()) + hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select); + lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift); + Temp mid = bld.tmp(s1); + lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo); + hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift); + mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid); + emit_split_vector(ctx, dst, 2); + } +} + +/* this function trims subdword vectors: + * if dst is vgpr - split the src and create a shrunk version according to the mask. + * if dst is sgpr - split the src, but move the original to sgpr. 
*/ +void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +{ + assert(vec_src.type() == RegType::vgpr); + emit_split_vector(ctx, vec_src, num_components); + + Builder bld(ctx->program, ctx->block); + std::array elems; + unsigned component_size = vec_src.bytes() / num_components; + RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); + + unsigned k = 0; + for (unsigned i = 0; i < num_components; i++) { + if (mask & (1 << i)) + elems[k++] = emit_extract_vector(ctx, vec_src, i, rc); + } + + if (dst.type() == RegType::vgpr) { + assert(dst.bytes() == k * component_size); + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)}; + for (unsigned i = 0; i < k; i++) + vec->operands[i] = Operand(elems[i]); + vec->definitions[0] = Definition(dst); + bld.insert(std::move(vec)); + } else { + // TODO: alignbyte if mask doesn't start with 1? + assert(mask & 1); + assert(dst.size() == vec_src.size()); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); + } + ctx->allocated_vec.emplace(dst.id(), elems); +} + Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2)) { Builder bld(ctx->program, ctx->block); @@ -356,7 +451,7 @@ Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2 assert(val.regClass() == s1); assert(dst.regClass() == bld.lm); - return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); + return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); } Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1)) @@ -390,11 +485,33 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) } Temp vec = get_ssa_temp(ctx, src.src.ssa); - unsigned elem_size = vec.size() / src.src.ssa->num_components; - assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */ - assert(vec.size() % elem_size == 0); + unsigned elem_size = vec.bytes() / src.src.ssa->num_components; + assert(elem_size > 0); + assert(vec.bytes() % elem_size == 0); + + if (elem_size < 4 && vec.type() == RegType::sgpr) { + assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); + assert(size == 1); + unsigned swizzle = src.swizzle[0]; + if (vec.size() > 1) { + assert(src.src.ssa->bit_size == 16); + vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); + swizzle = swizzle & 1; + } + if (swizzle == 0) + return vec; + + Temp dst{ctx->program->allocateId(), s1}; + aco_ptr bfe{create_instruction(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)}; + bfe->operands[0] = Operand(vec); + bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle))); + bfe->definitions[0] = Definition(dst); + bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); + ctx->block->instructions.emplace_back(std::move(bfe)); + return dst; + } - RegClass elem_rc = RegClass(vec.type(), elem_size); + RegClass elem_rc = elem_size < 4 ? 
RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4); if (size == 1) { return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); } else { @@ -405,7 +522,7 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); vec_instr->operands[i] = Operand{elems[i]}; } - Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)}; + Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)}; vec_instr->definitions[0] = Definition(dst); ctx->block->instructions.emplace_back(std::move(vec_instr)); ctx->allocated_vec.emplace(dst.id(), elems); @@ -446,16 +563,8 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o Temp t = src0; src0 = src1; src1 = t; - } else if (src0.type() == RegType::vgpr && - op != aco_opcode::v_madmk_f32 && - op != aco_opcode::v_madak_f32 && - op != aco_opcode::v_madmk_f16 && - op != aco_opcode::v_madak_f16) { - /* If the instruction is not commutative, we emit a VOP3A instruction */ - bld.vop2_e64(op, Definition(dst), src0, src1); - return; } else { - src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr + src1 = as_vgpr(ctx, src1); } } @@ -511,6 +620,24 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o if (src0.type() == RegType::vgpr) { /* to swap the operands, we might also have to change the opcode */ switch (op) { + case aco_opcode::v_cmp_lt_f16: + op = aco_opcode::v_cmp_gt_f16; + break; + case aco_opcode::v_cmp_ge_f16: + op = aco_opcode::v_cmp_le_f16; + break; + case aco_opcode::v_cmp_lt_i16: + op = aco_opcode::v_cmp_gt_i16; + break; + case aco_opcode::v_cmp_ge_i16: + op = aco_opcode::v_cmp_le_i16; + break; + case aco_opcode::v_cmp_lt_u16: + op = aco_opcode::v_cmp_gt_u16; + break; + case aco_opcode::v_cmp_ge_u16: + op = aco_opcode::v_cmp_le_u16; + break; case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break; @@ -580,10 +707,10 @@ void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o } void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, - aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes) + aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes) { - aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op; - aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op; + aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes; + aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op; bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index]; bool use_valu = s_op == aco_opcode::num_opcodes || divergent_vals || @@ -591,6 +718,7 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr; aco_opcode op = use_valu ? 
v_op : s_op; assert(op != aco_opcode::num_opcodes); + assert(dst.regClass() == ctx->program->lane_mask); if (use_valu) emit_vopc_instruction(ctx, instr, op, dst); @@ -735,6 +863,134 @@ void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val) emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u); } +Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->options->chip_class >= GFX7) + return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); + + /* GFX6 doesn't support V_TRUNC_F64, lower it. */ + /* TODO: create more efficient code! */ + if (val.type() == RegType::sgpr) + val = as_vgpr(ctx, val); + + /* Split the input value. */ + Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); + + /* Extract the exponent and compute the unbiased value. */ + Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u)); + exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u)); + + /* Extract the fractional part. */ + Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); + fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); + + Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask); + + Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); + Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); + fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp); + tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi); + fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); + + /* Get the sign bit. */ + Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi); + + /* Decide the operation to apply depending on the unbiased exponent. */ + Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); + Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0); + Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); + Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u)); + dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); + dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51); + + return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); +} + +Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->options->chip_class >= GFX7) + return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); + + /* GFX6 doesn't support V_FLOOR_F64, lower it. 
*/ + Temp src0 = as_vgpr(ctx, val); + + Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */ + Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu)); + + Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask); + Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); + Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val); + + Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0); + Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min); + + Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan); + Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan); + + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1); + + Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v); + static_cast(add)->neg[1] = true; + + return add->definitions[0].getTemp(); +} + +Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) { + if (!dst.id()) { + if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) + dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u)); + else + dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword()); + } + + if (dst.bytes() == src.bytes() && dst_bits < src_bits) + return bld.copy(Definition(dst), src); + else if (dst.bytes() < src.bytes()) + return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); + + Temp tmp = dst; + if (dst_bits == 64) + tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1); + + if (tmp == src) { + } else if (src.regClass() == s1) { + if (is_signed) + bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src); + else + bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src); + } else { + assert(src_bits != 8 || src.regClass() == v1b); + assert(src_bits != 16 || src.regClass() == v2b); + aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + sdwa->operands[0] = Operand(src); + sdwa->definitions[0] = Definition(tmp); + if (is_signed) + sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword; + else + sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword; + sdwa->dst_sel = tmp.bytes() == 2 ? 
sdwa_uword : sdwa_udword; + bld.insert(std::move(sdwa)); + } + + if (dst_bits == 64) { + if (is_signed && dst.regClass() == s2) { + Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); + } else if (is_signed && dst.regClass() == v2) { + Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); + } else { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u)); + } + } + + return dst; +} + void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) { if (!instr->dest.dest.is_ssa) { @@ -750,14 +1006,43 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_vec3: case nir_op_vec4: { std::array elems; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; - for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) { + unsigned num = instr->dest.dest.ssa.num_components; + for (unsigned i = 0; i < num; ++i) elems[i] = get_alu_src(ctx, instr->src[i]); - vec->operands[i] = Operand{elems[i]}; + + if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) { + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; + RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u); + for (unsigned i = 0; i < num; ++i) { + if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) + vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc)); + else + vec->operands[i] = Operand{elems[i]}; + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + } else { + // TODO: that is a bit suboptimal.. 
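/* A minimal scalar model (not part of the patch) of the SGPR packing loop
 * that follows, assuming 8- or 16-bit components; the function and variable
 * names are illustrative only. Each component is masked to its bit size and
 * OR'd into the 32-bit word it lands in, which is what the s_and_b32 /
 * s_lshl_b32 / s_or_b32 sequence below computes with builder instructions. */
#include <cstdint>
#include <vector>

std::vector<uint32_t> pack_subdword_model(const std::vector<uint32_t> &elems,
                                          unsigned bit_size)
{
   uint32_t mask = (1u << bit_size) - 1;
   std::vector<uint32_t> words((elems.size() * bit_size + 31) / 32, 0);
   for (unsigned i = 0; i < elems.size(); i++) {
      unsigned bit = i * bit_size;
      /* drop garbage above bit_size, then place the value at its bit offset */
      words[bit / 32] |= (elems[i] & mask) << (bit % 32);
   }
   return words;
}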
+ Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1)); + for (unsigned i = 0; i < num - 1; ++i) + if (((i+1) * instr->dest.dest.ssa.bit_size) % 32) + elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); + for (unsigned i = 0; i < num; ++i) { + unsigned bit = i * instr->dest.dest.ssa.bit_size; + if (bit % 32 == 0) { + elems[bit / 32] = elems[i]; + } else { + elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), + elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32)); + elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]); + } + } + if (dst.size() == 1) + bld.copy(Definition(dst), elems[0]); + else + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]); } - vec->definitions[0] = Definition(dst); - ctx->block->instructions.emplace_back(std::move(vec)); - ctx->allocated_vec.emplace(dst.id(), elems); break; } case nir_op_mov: { @@ -787,7 +1072,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->dest.dest.ssa.bit_size == 1) { assert(src.regClass() == bld.lm); assert(dst.regClass() == bld.lm); - bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src); + /* Don't use s_andn2 here, this allows the optimizer to make a better decision */ + Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); + bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm)); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); } else if (dst.type() == RegType::sgpr) { @@ -845,9 +1132,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_isign: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == s1) { - Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); - Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u)); - bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp); + Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1)); + bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u)); } else if (dst.regClass() == s2) { Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u)); Temp neqz; @@ -858,9 +1144,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) /* SCC gets zero-extended to 64 bit */ bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); } else if (dst.regClass() == v1) { - Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz); + bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u)); } else if (dst.regClass() == v2) { Temp upper = emit_extract_vector(ctx, src, 1, v1); Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper); @@ -1060,7 +1344,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp msb = sub.def(0).getTemp(); Temp carry = sub.def(1).getTemp(); - bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), 
msb, bld.scc(carry)); } else if (src.regClass() == v1) { aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; Temp msb_rev = bld.tmp(v1); @@ -1309,11 +1593,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fmul: { - if (dst.size() == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); + } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); - } else if (dst.size() == 2) { - bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1322,11 +1609,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fadd: { - if (dst.size() == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); + } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); - } else if (dst.size() == 2) { - bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1337,15 +1627,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsub: { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); + else + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); + } else if (dst.regClass() == v1) { if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); else emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), - get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + as_vgpr(ctx, src0), as_vgpr(ctx, src1)); VOP3A_instruction* sub = static_cast(add); sub->neg[1] = true; } else { @@ -1356,18 +1650,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fmax: { - if (dst.size() == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); + if (dst.regClass() == v2b) { + // TODO: check fp_mode.must_flush_denorms16_64 + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true); + } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { if 
(ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) { - Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), - get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1); bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp); } else { - bld.vop3(aco_opcode::v_max_f64, Definition(dst), - get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1); } } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1377,18 +1672,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fmin: { - if (dst.size() == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); + if (dst.regClass() == v2b) { + // TODO: check fp_mode.must_flush_denorms16_64 + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true); + } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) { - Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), - get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1); bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp); } else { - bld.vop3(aco_opcode::v_min_f64, Definition(dst), - get_alu_src(ctx, instr->src[0]), - as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); + bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1); } } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1398,7 +1694,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fmax3: { - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false); + } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1408,7 +1706,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fmin3: { - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false); + } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1418,7 +1718,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fmed3: { - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false); + } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1514,9 +1816,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_frsq: { - if (dst.size() == 1) { - emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (dst.size() == 2) { + Temp src = 
get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); + } else if (dst.regClass() == v1) { + emit_rsq(ctx, bld, Definition(dst), src); + } else if (dst.regClass() == v2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1527,11 +1832,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fneg: { Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src)); + } else if (dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src)); - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { if (ctx->block->fp_mode.must_flush_denorms16_64) src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); @@ -1547,11 +1854,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fabs: { Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src)); + } else if (dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src)); - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { if (ctx->block->fp_mode.must_flush_denorms16_64) src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); @@ -1567,11 +1876,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fsat: { Temp src = get_alu_src(ctx, instr->src[0]); - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand(0u), Operand(0x3f800000u), src); + } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ // TODO: confirm that this holds under any circumstances - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u)); VOP3A_instruction* vop3 = static_cast(add); vop3->clamp = true; @@ -1583,8 +1894,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_flog2: { - if (dst.size() == 1) { - emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); + } else if (dst.regClass() == v1) { + emit_log2(ctx, bld, Definition(dst), src); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1593,9 +1907,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_frcp: { - if (dst.size() == 1) { - emit_rcp(ctx, bld, 
Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (dst.size() == 2) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); + } else if (dst.regClass() == v1) { + emit_rcp(ctx, bld, Definition(dst), src); + } else if (dst.regClass() == v2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1605,7 +1922,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fexp2: { - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); + } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1615,9 +1934,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fsqrt: { - if (dst.size() == 1) { - emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); - } else if (dst.size() == 2) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); + } else if (dst.regClass() == v1) { + emit_sqrt(ctx, bld, Definition(dst), src); + } else if (dst.regClass() == v2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1627,9 +1949,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_ffract: { - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); + } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1639,10 +1963,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_ffloor: { - if (dst.size() == 1) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); + } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); - } else if (dst.size() == 2) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst); + } else if (dst.regClass() == v2) { + emit_floor_f64(ctx, bld, Definition(dst), src); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1651,10 +1978,28 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fceil: { - if (dst.size() == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); + } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); - } else if (dst.size() == 2) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); + } else if (dst.regClass() == v2) { + if (ctx->options->chip_class >= GFX7) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); + } else { + /* GFX6 doesn't support V_CEIL_F64, lower it. 
*/ + /* trunc = trunc(src0) + * if (src0 > 0.0 && src0 != trunc) + * trunc += 1.0 + */ + Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); + Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u)); + Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc); + Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1); + Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond); + add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add); + bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add); + } } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1663,10 +2008,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_ftrunc: { - if (dst.size() == 1) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); + } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); - } else if (dst.size() == 2) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst); + } else if (dst.regClass() == v2) { + emit_trunc_f64(ctx, bld, Definition(dst), src); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1675,10 +2023,38 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fround_even: { - if (dst.size() == 1) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); + } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); - } else if (dst.size() == 2) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); + } else if (dst.regClass() == v2) { + if (ctx->options->chip_class >= GFX7) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); + } else { + /* GFX6 doesn't support V_RNDNE_F64, lower it. 
*/ + Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); + + Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u))); + Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi)); + Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi)); + static_cast(sub)->neg[1] = true; + tmp = sub->definitions[0].getTemp(); + + Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu)); + Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v); + static_cast(vop3)->abs[0] = true; + Temp cond = vop3->definitions[0].getTemp(); + + Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); + Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond); + Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond); + + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); + } } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1688,11 +2064,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fsin: case nir_op_fcos: { - Temp src = get_alu_src(ctx, instr->src[0]); + Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); aco_ptr norm; - if (dst.size() == 1) { - Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); - Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src)); + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); + if (dst.regClass() == v2b) { + Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); + aco_opcode opcode = instr->op == nir_op_fsin ? 
aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; + bld.vop1(opcode, Definition(dst), tmp); + } else if (dst.regClass() == v1) { + Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src); /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ if (ctx->options->chip_class < GFX9) @@ -1708,14 +2088,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_ldexp: { - if (dst.size() == 1) { - bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), - as_vgpr(ctx, get_alu_src(ctx, instr->src[0])), - get_alu_src(ctx, instr->src[1])); - } else if (dst.size() == 2) { - bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), - as_vgpr(ctx, get_alu_src(ctx, instr->src[0])), - get_alu_src(ctx, instr->src[1])); + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v2b) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); + } else if (dst.regClass() == v1) { + bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1); + } else if (dst.regClass() == v2) { + bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1724,12 +2104,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_frexp_sig: { - if (dst.size() == 1) { - bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), - get_alu_src(ctx, instr->src[0])); - } else if (dst.size() == 2) { - bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), - get_alu_src(ctx, instr->src[0])); + Temp src = get_alu_src(ctx, instr->src[0]); + if (dst.regClass() == v2b) { + bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src); + } else if (dst.regClass() == v1) { + bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src); + } else if (dst.regClass() == v2) { + bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1738,12 +2119,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_frexp_exp: { - if (instr->src[0].src.ssa->bit_size == 32) { - bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), - get_alu_src(ctx, instr->src[0])); + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 16) { + Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src); + tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u)); + convert_int(bld, tmp, 8, 32, true, dst); + } else if (instr->src[0].src.ssa->bit_size == 32) { + bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { - bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), - get_alu_src(ctx, instr->src[0])); + bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1753,12 +2137,19 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fsign: { Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); - if (dst.size() == 1) { + if (dst.regClass() == v2b) { + Temp one = bld.copy(bld.def(v1), Operand(0x3c00u)); + Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u)); + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + 
src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond); + cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond); + } else if (dst.regClass() == v1) { Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond); - } else if (dst.size() == 2) { + } else if (dst.regClass() == v2) { Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u)); Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond); @@ -1775,8 +2166,25 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } break; } + case nir_op_f2f16: + case nir_op_f2f16_rtne: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 64) + src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); + break; + } + case nir_op_f2f16_rtz: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 64) + src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u)); + break; + } case nir_op_f2f32: { - if (instr->src[0].src.ssa->bit_size == 64) { + if (instr->src[0].src.ssa->bit_size == 16) { + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst); + } else if (instr->src[0].src.ssa->bit_size == 64) { emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1786,23 +2194,34 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_f2f64: { - if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst); - } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - } + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 16) + src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); + bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src); + break; + } + case nir_op_i2f16: { + assert(dst.regClass() == v2b); + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 8) + src = convert_int(bld, src, 8, 16, true); + bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); break; } case nir_op_i2f32: { assert(dst.size() == 1); - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst); + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size <= 16) + src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); break; } case nir_op_i2f64: { - if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst); + if (instr->src[0].src.ssa->bit_size <= 32) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size <= 16) + src = convert_int(bld, src, 
instr->src[0].src.ssa->bit_size, 32, true); + bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { Temp src = get_alu_src(ctx, instr->src[0]); RegClass rc = RegClass(src.type(), 1); @@ -1820,14 +2239,33 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } break; } + case nir_op_u2f16: { + assert(dst.regClass() == v2b); + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 8) + src = convert_int(bld, src, 8, 16, false); + bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); + break; + } case nir_op_u2f32: { assert(dst.size() == 1); - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst); + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 8) { + //TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment + bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src); + } else { + if (instr->src[0].src.ssa->bit_size == 16) + src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src); + } break; } case nir_op_u2f64: { - if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst); + if (instr->src[0].src.ssa->bit_size <= 32) { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size <= 16) + src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, false); + bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { Temp src = get_alu_src(ctx, instr->src[0]); RegClass rc = RegClass(src.type(), 1); @@ -1844,9 +2282,49 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } break; } + case nir_op_f2i8: + case nir_op_f2i16: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 16) + src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src); + else if (instr->src[0].src.ssa->bit_size == 32) + src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src); + else + src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src); + + if (dst.type() == RegType::vgpr) + bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + break; + } + case nir_op_f2u8: + case nir_op_f2u16: { + Temp src = get_alu_src(ctx, instr->src[0]); + if (instr->src[0].src.ssa->bit_size == 16) + src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src); + else if (instr->src[0].src.ssa->bit_size == 32) + src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src); + else + src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src); + + if (dst.type() == RegType::vgpr) + bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + break; + } case nir_op_f2i32: { Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size == 32) { + if (instr->src[0].src.ssa->bit_size == 16) { + Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); + if (dst.type() == RegType::vgpr) { + bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp); + } else { + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); + } + } else if (instr->src[0].src.ssa->bit_size == 32) { if (dst.type() == RegType::vgpr) bld.vop1(aco_opcode::v_cvt_i32_f32, 
Definition(dst), src); else @@ -1869,7 +2347,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_f2u32: { Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size == 32) { + if (instr->src[0].src.ssa->bit_size == 16) { + Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); + if (dst.type() == RegType::vgpr) { + bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp); + } else { + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); + } + } else if (instr->src[0].src.ssa->bit_size == 32) { if (dst.type() == RegType::vgpr) bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src); else @@ -1892,7 +2378,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_f2i64: { Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { + if (instr->src[0].src.ssa->bit_size == 16) + src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); + + if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u)); Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); @@ -1918,13 +2407,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper); - } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) { + } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { if (src.type() == RegType::vgpr) src = bld.as_uniform(src); Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); - exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); - exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); - exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); + exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent); Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); @@ -1946,10 +2435,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (instr->src[0].src.ssa->bit_size == 64) { Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); - Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src); + Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); - Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul); + Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); Temp fma = 
bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor); @@ -1968,7 +2457,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_f2u64: { Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { + if (instr->src[0].src.ssa->bit_size == 16) + src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); + + if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent); exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent); @@ -1991,12 +2483,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); - } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) { + } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { if (src.type() == RegType::vgpr) src = bld.as_uniform(src); Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u)); - exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); - exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); + exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u)); + exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent); Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src); mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa); Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent); @@ -2015,10 +2507,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (instr->src[0].src.ssa->bit_size == 64) { Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u)); - Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src); + Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u)); - Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul); + Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor); @@ -2035,15 +2527,30 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } break; } - case nir_op_b2f32: { + case nir_op_b2f16: { Temp src = get_alu_src(ctx, instr->src[0]); assert(src.regClass() == bld.lm); if (dst.regClass() == s1) { src = bool_to_scalar_condition(ctx, src); - bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src); - } else if (dst.regClass() == v1) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), 
Operand(0u), Operand(0x3f800000u), src); + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src); + } else if (dst.regClass() == v2b) { + Temp one = bld.copy(bld.def(v1), Operand(0x3c00u)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src); + } else { + unreachable("Wrong destination register class for nir_op_b2f16."); + } + break; + } + case nir_op_b2f32: { + Temp src = get_alu_src(ctx, instr->src[0]); + assert(src.regClass() == bld.lm); + + if (dst.regClass() == s1) { + src = bool_to_scalar_condition(ctx, src); + bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src); + } else if (dst.regClass() == v1) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); } else { unreachable("Wrong destination register class for nir_op_b2f32."); } @@ -2065,63 +2572,23 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } break; } - case nir_op_i2i32: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size == 64) { - /* we can actually just say dst = src, as it would map the lower register */ - emit_extract_vector(ctx, src, 0, dst); - } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - } - break; - } - case nir_op_u2u32: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size == 16) { - if (dst.regClass() == s1) { - bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src); - } else { - // TODO: do better with SDWA - bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src); - } - } else if (instr->src[0].src.ssa->bit_size == 64) { - /* we can actually just say dst = src, as it would map the lower register */ - emit_extract_vector(ctx, src, 0, dst); - } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - } - break; - } + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: case nir_op_i2i64: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (src.regClass() == s1) { - Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u)); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high); - } else if (src.regClass() == v1) { - Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high); - } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - } + convert_int(bld, get_alu_src(ctx, instr->src[0]), + instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst); break; } + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: case nir_op_u2u64: { - Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->src[0].src.ssa->bit_size == 32) { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u)); - } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - } + convert_int(bld, get_alu_src(ctx, instr->src[0]), + instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst); break; } + case nir_op_b2b32: case nir_op_b2i32: { Temp src = get_alu_src(ctx, instr->src[0]); assert(src.regClass() == bld.lm); @@ -2136,12 +2603,14 @@ void visit_alu_instr(isel_context *ctx, 
nir_alu_instr *instr) } break; } + case nir_op_b2b1: case nir_op_i2b1: { Temp src = get_alu_src(ctx, instr->src[0]); assert(dst.regClass() == bld.lm); if (src.type() == RegType::vgpr) { assert(src.regClass() == v1 || src.regClass() == v2); + assert(dst.regClass() == bld.lm); bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, Definition(dst), Operand(0u), src).def(0).setHint(vcc); } else { @@ -2170,6 +2639,34 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_unpack_64_2x32_split_y: bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); break; + case nir_op_unpack_32_2x16_split_x: + if (dst.type() == RegType::vgpr) { + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0])); + } else { + bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); + } + break; + case nir_op_unpack_32_2x16_split_y: + if (dst.type() == RegType::vgpr) { + bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0])); + } else { + bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16))); + } + break; + case nir_op_pack_32_2x16_split: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + if (dst.regClass() == v1) { + src0 = emit_extract_vector(ctx, src0, 0, v2b); + src1 = emit_extract_vector(ctx, src1, 0, v2b); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); + } else { + src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu)); + src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u)); + bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1); + } + break; + } case nir_op_pack_half_2x16: { Temp src = get_alu_src(ctx, instr->src[0], 2); @@ -2229,7 +2726,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) */ f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u)); - Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), f32, smallest); + Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest); static_cast(vop3)->abs[0] = true; cmp_res = vop3->definitions[0].getTemp(); } @@ -2378,34 +2875,34 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_flt: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64); break; } case nir_op_fge: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64); break; } case nir_op_feq: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64); break; } case nir_op_fne: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64); break; } case nir_op_ilt: { - 
emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); break; } case nir_op_ige: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); break; } case nir_op_ieq: { if (instr->src[0].src.ssa->bit_size == 1) emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); else - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); break; } @@ -2413,16 +2910,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size == 1) emit_boolean_logic(ctx, instr, Builder::s_xor, dst); else - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); break; } case nir_op_ult: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); break; } case nir_op_uge: { - emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); + emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); break; } case nir_op_fddx: @@ -2483,6 +2980,12 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) int val = instr->value[0].b ? -1 : 0; Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val); bld.sop1(Builder::s_mov, Definition(dst), op); + } else if (instr->def.bit_size == 8) { + /* ensure that the value is correctly represented in the low byte of the register */ + bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8); + } else if (instr->def.bit_size == 16) { + /* ensure that the value is correctly represented in the low half of the register */ + bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16); } else if (dst.size() == 1) { bld.copy(Definition(dst), Operand(instr->value[0].u32)); } else { @@ -2509,491 +3012,1419 @@ uint32_t widen_mask(uint32_t mask, unsigned multiplier) return new_mask; } -void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr) +void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst) { - /* This wouldn't work inside control flow or with indirect offsets but - * that doesn't happen because of nir_lower_io_to_temporaries(). 
*/ - - unsigned write_mask = nir_intrinsic_write_mask(instr); - unsigned component = nir_intrinsic_component(instr); - Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - unsigned idx = nir_intrinsic_base(instr) + component; + Builder bld(ctx->program, ctx->block); + if (offset.isTemp()) { + Temp tmp[3] = {vec, vec, vec}; - nir_instr *off_instr = instr->src[1].ssa->parent_instr; - if (off_instr->type != nir_instr_type_load_const) { - fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); - nir_print_instr(off_instr, stderr); - fprintf(stderr, "\n"); - } - idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u; + if (vec.size() == 3) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); + } else if (vec.size() == 2) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); + } + for (unsigned i = 0; i < dst.size(); i++) + tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); - if (instr->src[0].ssa->bit_size == 64) - write_mask = widen_mask(write_mask, 2); + vec = tmp[0]; + if (dst.size() == 2) + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); - for (unsigned i = 0; i < 8; ++i) { - if (write_mask & (1 << i)) { - ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u); - ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1); - } - idx++; + offset = Operand(0u); } -} -void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr) -{ - Builder bld(ctx->program, ctx->block); - unsigned write_mask = nir_intrinsic_write_mask(instr); - Operand values[4]; - Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - for (unsigned i = 0; i < 4; ++i) { - if (write_mask & (1 << i)) { - Temp tmp = emit_extract_vector(ctx, src, i, v1); - values[i] = Operand(tmp); - } else { - values[i] = Operand(v1); - } - } + if (vec.bytes() == dst.bytes() && offset.constantValue() == 0) + bld.copy(Definition(dst), vec); + else + trim_subdword_vector(ctx, vec, dst, vec.bytes(), ((1 << dst.bytes()) - 1) << offset.constantValue()); +} - unsigned index = nir_intrinsic_base(instr) / 4; - unsigned target, col_format; - unsigned enabled_channels = 0xF; - aco_opcode compr_op = (aco_opcode)0; +struct LoadEmitInfo { + Operand offset; + Temp dst; + unsigned num_components; + unsigned component_size; + Temp resource = Temp(0, s1); + unsigned component_stride = 0; + unsigned const_offset = 0; + unsigned align_mul = 0; + unsigned align_offset = 0; + + bool glc = false; + unsigned swizzle_component_size = 0; + barrier_interaction barrier = barrier_none; + bool can_reorder = true; + Temp soffset = Temp(0, s1); +}; - nir_const_value* offset = nir_src_as_const_value(instr->src[1]); - assert(offset && "Non-const offsets on exports not yet supported"); - index += offset->u32; +using LoadCallback = Temp(*)( + Builder& bld, const LoadEmitInfo* info, Temp offset, unsigned bytes_needed, + unsigned align, unsigned const_offset, Temp dst_hint); - assert(index != FRAG_RESULT_COLOR); +template +void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) +{ + unsigned load_size = info->num_components * info->component_size; + unsigned component_size = info->component_size; - /* Unlike vertex shader exports, it's fine to use multiple exports to - * export separate channels of one target. 
So shaders which export both - * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine. - * TODO: combine the exports in those cases and create better code - */ + unsigned num_vals = 0; + Temp vals[info->dst.bytes()]; - if (index == FRAG_RESULT_SAMPLE_MASK) { + unsigned const_offset = info->const_offset; - if (ctx->program->info->ps.writes_z) { - target = V_008DFC_SQ_EXP_MRTZ; - enabled_channels = 0x4; - col_format = (unsigned) -1; + unsigned align_mul = info->align_mul ? info->align_mul : component_size; + unsigned align_offset = (info->align_offset + const_offset) % align_mul; - values[2] = values[0]; - values[0] = Operand(v1); - } else { - bld.exp(aco_opcode::exp, Operand(v1), Operand(values[0]), Operand(v1), Operand(v1), - 0xc, V_008DFC_SQ_EXP_MRTZ, true); - return; + unsigned bytes_read = 0; + while (bytes_read < load_size) { + unsigned bytes_needed = load_size - bytes_read; + + /* add buffer for unaligned loads */ + int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1; + + if (byte_align) { + if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) { + if (info->component_stride) { + assert(supports_8bit_16bit_loads && "unimplemented"); + bytes_needed = 2; + byte_align = 0; + } else { + bytes_needed += byte_align == -1 ? 4 - info->align_mul : byte_align; + bytes_needed = align(bytes_needed, 4); + } + } else { + byte_align = 0; + } } - } else if (index == FRAG_RESULT_DEPTH) { + if (info->swizzle_component_size) + bytes_needed = MIN2(bytes_needed, info->swizzle_component_size); + if (info->component_stride) + bytes_needed = MIN2(bytes_needed, info->component_size); + + bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4); - target = V_008DFC_SQ_EXP_MRTZ; - enabled_channels = 0x1; - col_format = (unsigned) -1; + /* reduce constant offset */ + Operand offset = info->offset; + unsigned reduced_const_offset = const_offset; + bool remove_const_offset_completely = need_to_align_offset; + if (const_offset && (remove_const_offset_completely || const_offset >= max_const_offset_plus_one)) { + unsigned to_add = const_offset; + if (remove_const_offset_completely) { + reduced_const_offset = 0; + } else { + to_add = const_offset / max_const_offset_plus_one * max_const_offset_plus_one; + reduced_const_offset %= max_const_offset_plus_one; + } + Temp offset_tmp = offset.isTemp() ? 
offset.getTemp() : Temp(); + if (offset.isConstant()) { + offset = Operand(offset.constantValue() + to_add); + } else if (offset_tmp.regClass() == s1) { + offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + offset_tmp, Operand(to_add)); + } else if (offset_tmp.regClass() == v1) { + offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add)); + } else { + Temp lo = bld.tmp(offset_tmp.type(), 1); + Temp hi = bld.tmp(offset_tmp.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); + + if (offset_tmp.regClass() == s2) { + Temp carry = bld.tmp(s1); + lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add)); + hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry); + offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi); + } else { + Temp new_lo = bld.tmp(v1); + Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp(); + hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry); + offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi); + } + } + } - } else if (index == FRAG_RESULT_STENCIL) { + /* align offset down if needed */ + Operand aligned_offset = offset; + if (need_to_align_offset) { + Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp(); + if (offset.isConstant()) { + aligned_offset = Operand(offset.constantValue() & 0xfffffffcu); + } else if (offset_tmp.regClass() == s1) { + aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp); + } else if (offset_tmp.regClass() == s2) { + aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp); + } else if (offset_tmp.regClass() == v1) { + aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp); + } else if (offset_tmp.regClass() == v2) { + Temp hi = bld.tmp(v1), lo = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); + lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), lo); + aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi); + } + } + Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() : + bld.copy(bld.def(s1), aligned_offset); + + unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; + Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align, + reduced_const_offset, byte_align ? 
Temp() : info->dst); + + /* shift result right if needed */ + if (byte_align) { + Operand align((uint32_t)byte_align); + if (byte_align == -1) { + if (offset.isConstant()) + align = Operand(offset.constantValue() % 4u); + else if (offset.size() == 2) + align = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1))); + else + align = offset; + } - if (ctx->program->info->ps.writes_z) { - target = V_008DFC_SQ_EXP_MRTZ; - enabled_channels = 0x2; - col_format = (unsigned) -1; + if (align.isTemp() || align.constantValue()) { + assert(val.bytes() >= load_size && "unimplemented"); + Temp new_val = bld.tmp(RegClass::get(val.type(), load_size)); + if (val.type() == RegType::sgpr) + byte_align_scalar(ctx, val, align, new_val); + else + byte_align_vector(ctx, val, align, new_val); + val = new_val; + } + } - values[1] = values[0]; - values[0] = Operand(v1); + /* add result to list and advance */ + if (info->component_stride) { + assert(val.bytes() == info->component_size && "unimplemented"); + const_offset += info->component_stride; + align_offset = (align_offset + info->component_stride) % align_mul; } else { - values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]); - bld.exp(aco_opcode::exp, values[0], Operand(v1), Operand(v1), Operand(v1), - 0x3, V_008DFC_SQ_EXP_MRTZ, true); - return; + const_offset += val.bytes(); + align_offset = (align_offset + val.bytes()) % align_mul; } - - } else { - index -= FRAG_RESULT_DATA0; - target = V_008DFC_SQ_EXP_MRT + index; - col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf; + bytes_read += val.bytes(); + vals[num_vals++] = val; } - bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1; - bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1; - - switch (col_format) - { - case V_028714_SPI_SHADER_ZERO: - enabled_channels = 0; /* writemask */ - target = V_008DFC_SQ_EXP_NULL; - break; - - case V_028714_SPI_SHADER_32_R: - enabled_channels = 1; - break; - case V_028714_SPI_SHADER_32_GR: - enabled_channels = 0x3; - break; + /* the callback wrote directly to dst */ + if (vals[0] == info->dst) { + assert(num_vals == 1); + emit_split_vector(ctx, info->dst, info->num_components); + return; + } - case V_028714_SPI_SHADER_32_AR: - if (ctx->options->chip_class >= GFX10) { - /* Special case: on GFX10, the outputs are different for 32_AR */ - enabled_channels = 0x3; - values[1] = values[3]; - values[3] = Operand(v1); - } else { - enabled_channels = 0x9; + /* create array of components */ + unsigned components_split = 0; + std::array allocated_vec; + bool has_vgprs = false; + for (unsigned i = 0; i < num_vals;) { + Temp tmp[num_vals]; + unsigned num_tmps = 0; + unsigned tmp_size = 0; + RegType reg_type = RegType::sgpr; + while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) { + if (vals[i].type() == RegType::vgpr) + reg_type = RegType::vgpr; + tmp_size += vals[i].bytes(); + tmp[num_tmps++] = vals[i++]; + } + if (num_tmps > 1) { + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)}; + for (unsigned i = 0; i < num_vals; i++) + vec->operands[i] = Operand(tmp[i]); + tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size)); + vec->definitions[0] = Definition(tmp[0]); + bld.insert(std::move(vec)); } - break; - - case V_028714_SPI_SHADER_FP16_ABGR: - enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pkrtz_f16_f32; - break; - case V_028714_SPI_SHADER_UNORM16_ABGR: - enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_u16_f32; - 
break; + if (tmp[0].bytes() % component_size) { + /* trim tmp[0] */ + assert(i == num_vals); + RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size); + tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u)); + } - case V_028714_SPI_SHADER_SNORM16_ABGR: - enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_i16_f32; - break; + RegClass elem_rc = RegClass::get(reg_type, component_size); - case V_028714_SPI_SHADER_UINT16_ABGR: { - enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pk_u16_u32; - if (is_int8 || is_int10) { - /* clamp */ - uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0; - Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); + unsigned start = components_split; - for (unsigned i = 0; i < 4; i++) { - if ((write_mask >> i) & 1) { - values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), - i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val), - values[i]); - } + if (tmp_size == elem_rc.bytes()) { + allocated_vec[components_split++] = tmp[0]; + } else { + assert(tmp_size % elem_rc.bytes() == 0); + aco_ptr split{create_instruction( + aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())}; + for (unsigned i = 0; i < split->definitions.size(); i++) { + Temp component = bld.tmp(elem_rc); + allocated_vec[components_split++] = component; + split->definitions[i] = Definition(component); } + split->operands[0] = Operand(tmp[0]); + bld.insert(std::move(split)); } - break; - } - - case V_028714_SPI_SHADER_SINT16_ABGR: - enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pk_i16_i32; - if (is_int8 || is_int10) { - /* clamp */ - uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; - uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0; - Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); - Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb)); - for (unsigned i = 0; i < 4; i++) { - if ((write_mask >> i) & 1) { - values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), - i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val), - values[i]); - values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), - i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val), - values[i]); - } - } + /* try to p_as_uniform early so we can create more optimizable code and + * also update allocated_vec */ + for (unsigned j = start; j < components_split; j++) { + if (allocated_vec[j].bytes() % 4 == 0 && info->dst.type() == RegType::sgpr) + allocated_vec[j] = bld.as_uniform(allocated_vec[j]); + has_vgprs |= allocated_vec[j].type() == RegType::vgpr; } - break; - - case V_028714_SPI_SHADER_32_ABGR: - enabled_channels = 0xF; - break; - - default: - break; } - if (target == V_008DFC_SQ_EXP_NULL) - return; + /* concatenate components and p_as_uniform() result if needed */ + if (info->dst.type() == RegType::vgpr || !has_vgprs) + ctx->allocated_vec.emplace(info->dst.id(), allocated_vec); - if ((bool) compr_op) { - for (int i = 0; i < 2; i++) { - /* check if at least one of the values to be compressed is enabled */ - unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1; - if (enabled) { - enabled_channels |= enabled << (i*2); - values[i] = bld.vop3(compr_op, bld.def(v1), - values[i*2].isUndefined() ? Operand(0u) : values[i*2], - values[i*2+1].isUndefined() ? 
Operand(0u): values[i*2+1]); - } else { - values[i] = Operand(v1); - } - } - values[2] = Operand(v1); - values[3] = Operand(v1); + int padding_bytes = MAX2((int)info->dst.bytes() - int(allocated_vec[0].bytes() * info->num_components), 0); + + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, info->num_components + !!padding_bytes, 1)}; + for (unsigned i = 0; i < info->num_components; i++) + vec->operands[i] = Operand(allocated_vec[i]); + if (padding_bytes) + vec->operands[info->num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes)); + if (info->dst.type() == RegType::sgpr && has_vgprs) { + Temp tmp = bld.tmp(RegType::vgpr, info->dst.size()); + vec->definitions[0] = Definition(tmp); + bld.insert(std::move(vec)); + bld.pseudo(aco_opcode::p_as_uniform, Definition(info->dst), tmp); } else { - for (int i = 0; i < 4; i++) - values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); + vec->definitions[0] = Definition(info->dst); + bld.insert(std::move(vec)); } - - bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], - enabled_channels, target, (bool) compr_op); } -Operand load_lds_size_m0(isel_context *ctx) +Operand load_lds_size_m0(Builder& bld) { /* TODO: m0 does not need to be initialized on GFX9+ */ - Builder bld(ctx->program, ctx->block); return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff)); } -void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst, - Temp address, unsigned base_offset, unsigned align) +Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info, + Temp offset, unsigned bytes_needed, + unsigned align, unsigned const_offset, + Temp dst_hint) { - assert(util_is_power_of_two_nonzero(align) && align >= 4); + offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset; - Builder bld(ctx->program, ctx->block); + Operand m = load_lds_size_m0(bld); - Operand m = load_lds_size_m0(ctx); + bool large_ds_read = bld.program->chip_class >= GFX7; + bool usable_read2 = bld.program->chip_class >= GFX7; - unsigned num_components = dst.size() * 4u / elem_size_bytes; - unsigned bytes_read = 0; - unsigned result_size = 0; - unsigned total_bytes = num_components * elem_size_bytes; - std::array result; - - while (bytes_read < total_bytes) { - unsigned todo = total_bytes - bytes_read; - bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0; - bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0; - - aco_opcode op = aco_opcode::last_opcode; - bool read2 = false; - if (todo >= 16 && aligned16) { - op = aco_opcode::ds_read_b128; - todo = 16; - } else if (todo >= 16 && aligned8) { - op = aco_opcode::ds_read2_b64; - read2 = true; - todo = 16; - } else if (todo >= 12 && aligned16) { - op = aco_opcode::ds_read_b96; - todo = 12; - } else if (todo >= 8 && aligned8) { - op = aco_opcode::ds_read_b64; - todo = 8; - } else if (todo >= 8) { - op = aco_opcode::ds_read2_b32; - read2 = true; - todo = 8; - } else if (todo >= 4) { - op = aco_opcode::ds_read_b32; - todo = 4; - } else { - assert(false); - } - assert(todo % elem_size_bytes == 0); - unsigned num_elements = todo / elem_size_bytes; - unsigned offset = base_offset + bytes_read; - unsigned max_offset = read2 ? 
1019 : 65535; + bool read2 = false; + unsigned size = 0; + aco_opcode op; + //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial + if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) { + size = 16; + op = aco_opcode::ds_read_b128; + } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) { + size = 16; + read2 = true; + op = aco_opcode::ds_read2_b64; + } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) { + size = 12; + op = aco_opcode::ds_read_b96; + } else if (bytes_needed >= 8 && align % 8 == 0) { + size = 8; + op = aco_opcode::ds_read_b64; + } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) { + size = 8; + read2 = true; + op = aco_opcode::ds_read2_b32; + } else if (bytes_needed >= 4 && align % 4 == 0) { + size = 4; + op = aco_opcode::ds_read_b32; + } else if (bytes_needed >= 2 && align % 2 == 0) { + size = 2; + op = aco_opcode::ds_read_u16; + } else { + size = 1; + op = aco_opcode::ds_read_u8; + } - Temp address_offset = address; - if (offset > max_offset) { - address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); - offset = bytes_read; - } - assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */ + unsigned max_offset_plus_one = read2 ? 254 * (size / 2u) + 1 : 65536; + if (const_offset >= max_offset_plus_one) { + offset = bld.vadd32(bld.def(v1), offset, Operand(const_offset / max_offset_plus_one)); + const_offset %= max_offset_plus_one; + } - Temp res; - if (num_components == 1 && dst.type() == RegType::vgpr) - res = dst; - else - res = bld.tmp(RegClass(RegType::vgpr, todo / 4)); + if (read2) + const_offset /= (size / 2u); - if (read2) - res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1); - else - res = bld.ds(op, Definition(res), address_offset, m, offset); + RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4)); + Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc); + if (read2) + bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); + else + bld.ds(op, Definition(val), offset, m, const_offset); - if (num_components == 1) { - assert(todo == total_bytes); - if (dst.type() == RegType::sgpr) - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res); - return; - } + if (size < 4) + val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u)); - if (dst.type() == RegType::sgpr) - res = bld.as_uniform(res); + return val; +} + +static auto emit_lds_load = emit_load; + +Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info, + Temp offset, unsigned bytes_needed, + unsigned align, unsigned const_offset, + Temp dst_hint) +{ + unsigned size = 0; + aco_opcode op; + if (bytes_needed <= 4) { + size = 1; + op = info->resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword; + } else if (bytes_needed <= 8) { + size = 2; + op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2; + } else if (bytes_needed <= 16) { + size = 4; + op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4; + } else if (bytes_needed <= 32) { + size = 8; + op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8; + } else { + size = 16; + op = info->resource.id() ? 
aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16; + } + aco_ptr load{create_instruction(op, Format::SMEM, 2, 1)}; + if (info->resource.id()) { + load->operands[0] = Operand(info->resource); + load->operands[1] = Operand(offset); + } else { + load->operands[0] = Operand(offset); + load->operands[1] = Operand(0u); + } + RegClass rc(RegType::sgpr, size); + Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc); + load->definitions[0] = Definition(val); + load->glc = info->glc; + load->dlc = info->glc && bld.program->chip_class >= GFX10; + load->barrier = info->barrier; + load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works + bld.insert(std::move(load)); + return val; +} + +static auto emit_smem_load = emit_load; + +Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, + Temp offset, unsigned bytes_needed, + unsigned align_, unsigned const_offset, + Temp dst_hint) +{ + Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); + + if (info->soffset.id()) { + if (soffset.isTemp()) + vaddr = bld.copy(bld.def(v1), soffset); + soffset = Operand(info->soffset); + } + + unsigned bytes_size = 0; + aco_opcode op; + if (bytes_needed == 1) { + bytes_size = 1; + op = aco_opcode::buffer_load_ubyte; + } else if (bytes_needed == 2) { + bytes_size = 2; + op = aco_opcode::buffer_load_ushort; + } else if (bytes_needed <= 4) { + bytes_size = 4; + op = aco_opcode::buffer_load_dword; + } else if (bytes_needed <= 8) { + bytes_size = 8; + op = aco_opcode::buffer_load_dwordx2; + } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) { + bytes_size = 12; + op = aco_opcode::buffer_load_dwordx3; + } else { + bytes_size = 16; + op = aco_opcode::buffer_load_dwordx4; + } + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = Operand(info->resource); + mubuf->operands[1] = vaddr; + mubuf->operands[2] = soffset; + mubuf->offen = (offset.type() == RegType::vgpr); + mubuf->glc = info->glc; + mubuf->dlc = info->glc && bld.program->chip_class >= GFX10; + mubuf->barrier = info->barrier; + mubuf->can_reorder = info->can_reorder; + mubuf->offset = const_offset; + RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); + Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
dst_hint : bld.tmp(rc); + mubuf->definitions[0] = Definition(val); + bld.insert(std::move(mubuf)); + + if (bytes_size < 4) + val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); + + return val; +} + +static auto emit_mubuf_load = emit_load; + +Temp get_gfx6_global_rsrc(Builder& bld, Temp addr) +{ + uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + if (addr.type() == RegType::vgpr) + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf)); + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf)); +} + +Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, + Temp offset, unsigned bytes_needed, + unsigned align_, unsigned const_offset, + Temp dst_hint) +{ + unsigned bytes_size = 0; + bool mubuf = bld.program->chip_class == GFX6; + bool global = bld.program->chip_class >= GFX9; + aco_opcode op; + if (bytes_needed == 1) { + bytes_size = 1; + op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte; + } else if (bytes_needed == 2) { + bytes_size = 2; + op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort; + } else if (bytes_needed <= 4) { + bytes_size = 4; + op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; + } else if (bytes_needed <= 8) { + bytes_size = 8; + op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; + } else if (bytes_needed <= 12 && !mubuf) { + bytes_size = 12; + op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; + } else { + bytes_size = 16; + op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; + } + RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); + Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc); + if (mubuf) { + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; + mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset)); + mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); + mubuf->operands[2] = Operand(0u); + mubuf->glc = info->glc; + mubuf->dlc = false; + mubuf->offset = 0; + mubuf->addr64 = offset.type() == RegType::vgpr; + mubuf->disable_wqm = false; + mubuf->barrier = info->barrier; + mubuf->definitions[0] = Definition(val); + bld.insert(std::move(mubuf)); + } else { + offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset; - if (num_elements == 1) { - result[result_size++] = res; + aco_ptr flat{create_instruction(op, global ? 
Format::GLOBAL : Format::FLAT, 2, 1)}; + flat->operands[0] = Operand(offset); + flat->operands[1] = Operand(s1); + flat->glc = info->glc; + flat->dlc = info->glc && bld.program->chip_class >= GFX10; + flat->barrier = info->barrier; + flat->offset = 0u; + flat->definitions[0] = Definition(val); + bld.insert(std::move(flat)); + } + + if (bytes_size < 4) + val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); + + return val; +} + +static auto emit_global_load = emit_load; + +Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst, + Temp address, unsigned base_offset, unsigned align) +{ + assert(util_is_power_of_two_nonzero(align)); + + Builder bld(ctx->program, ctx->block); + + unsigned num_components = dst.bytes() / elem_size_bytes; + LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes}; + info.align_mul = align; + info.align_offset = 0; + info.barrier = barrier_shared; + info.can_reorder = false; + info.const_offset = base_offset; + emit_lds_load(ctx, bld, &info); + + return dst; +} + +void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src) +{ + if (!count) + return; + + Builder bld(ctx->program, ctx->block); + + ASSERTED bool is_subdword = false; + for (unsigned i = 0; i < count; i++) + is_subdword |= offsets[i] % 4; + is_subdword |= (src.bytes() - offsets[count - 1]) % 4; + assert(!is_subdword || dst_type == RegType::vgpr); + + /* count == 1 fast path */ + if (count == 1) { + if (dst_type == RegType::sgpr) + dst[0] = bld.as_uniform(src); + else + dst[0] = as_vgpr(ctx, src); + return; + } + + for (unsigned i = 0; i < count - 1; i++) + dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i])); + dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1])); + + if (is_subdword && src.type() == RegType::sgpr) { + src = as_vgpr(ctx, src); + } else { + /* use allocated_vec if possible */ + auto it = ctx->allocated_vec.find(src.id()); + if (it != ctx->allocated_vec.end()) { + unsigned total_size = 0; + for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++) + total_size += it->second[i].bytes(); + if (total_size != src.bytes()) + goto split; + + unsigned elem_size = it->second[0].bytes(); + + for (unsigned i = 0; i < count; i++) { + if (offsets[i] % elem_size || dst[i].bytes() % elem_size) + goto split; + } + + for (unsigned i = 0; i < count; i++) { + unsigned start_idx = offsets[i] / elem_size; + unsigned op_count = dst[i].bytes() / elem_size; + if (op_count == 1) { + if (dst_type == RegType::sgpr) + dst[i] = bld.as_uniform(it->second[start_idx]); + else + dst[i] = as_vgpr(ctx, it->second[start_idx]); + continue; + } + + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)}; + for (unsigned j = 0; j < op_count; j++) { + Temp tmp = it->second[start_idx + j]; + if (dst_type == RegType::sgpr) + tmp = bld.as_uniform(tmp); + vec->operands[j] = Operand(tmp); + } + vec->definitions[0] = Definition(dst[i]); + bld.insert(std::move(vec)); + } + return; + } + } + + if (dst_type == RegType::sgpr) + src = bld.as_uniform(src); + + split: + /* just split it */ + aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)}; + split->operands[0] = Operand(src); + for (unsigned i = 0; i < count; i++) + split->definitions[i] = Definition(dst[i]); + bld.insert(std::move(split)); +} + +bool scan_write_mask(uint32_t mask, 
uint32_t todo_mask, + int *start, int *count) +{ + unsigned start_elem = ffs(todo_mask) - 1; + bool skip = !(mask & (1 << start_elem)); + if (skip) + mask = ~mask & todo_mask; + + mask &= todo_mask; + + u_bit_scan_consecutive_range(&mask, start, count); + + return !skip; +} + +void advance_write_mask(uint32_t *todo_mask, int start, int count) +{ + *todo_mask &= ~u_bit_consecutive(0, count) << start; +} + +void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, + Temp address, unsigned base_offset, unsigned align) +{ + assert(util_is_power_of_two_nonzero(align)); + assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8); + + Builder bld(ctx->program, ctx->block); + bool large_ds_write = ctx->options->chip_class >= GFX7; + bool usable_write2 = ctx->options->chip_class >= GFX7; + + unsigned write_count = 0; + Temp write_datas[32]; + unsigned offsets[32]; + aco_opcode opcodes[32]; + + wrmask = widen_mask(wrmask, elem_size_bytes); + + uint32_t todo = u_bit_consecutive(0, data.bytes()); + while (todo) { + int offset, bytes; + if (!scan_write_mask(wrmask, todo, &offset, &bytes)) { + offsets[write_count] = offset; + opcodes[write_count] = aco_opcode::num_opcodes; + write_count++; + advance_write_mask(&todo, offset, bytes); + continue; + } + + bool aligned2 = offset % 2 == 0 && align % 2 == 0; + bool aligned4 = offset % 4 == 0 && align % 4 == 0; + bool aligned8 = offset % 8 == 0 && align % 8 == 0; + bool aligned16 = offset % 16 == 0 && align % 16 == 0; + + //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial + aco_opcode op = aco_opcode::num_opcodes; + if (bytes >= 16 && aligned16 && large_ds_write) { + op = aco_opcode::ds_write_b128; + bytes = 16; + } else if (bytes >= 12 && aligned16 && large_ds_write) { + op = aco_opcode::ds_write_b96; + bytes = 12; + } else if (bytes >= 8 && aligned8) { + op = aco_opcode::ds_write_b64; + bytes = 8; + } else if (bytes >= 4 && aligned4) { + op = aco_opcode::ds_write_b32; + bytes = 4; + } else if (bytes >= 2 && aligned2) { + op = aco_opcode::ds_write_b16; + bytes = 2; + } else if (bytes >= 1) { + op = aco_opcode::ds_write_b8; + bytes = 1; } else { - assert(res != dst && res.size() % num_elements == 0); - aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)}; - split->operands[0] = Operand(res); - for (unsigned i = 0; i < num_elements; i++) - split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4)); - ctx->block->instructions.emplace_back(std::move(split)); + assert(false); + } + + offsets[write_count] = offset; + opcodes[write_count] = op; + write_count++; + advance_write_mask(&todo, offset, bytes); + } + + Operand m = load_lds_size_m0(bld); + + split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data); + + for (unsigned i = 0; i < write_count; i++) { + aco_opcode op = opcodes[i]; + if (op == aco_opcode::num_opcodes) + continue; + + Temp data = write_datas[i]; + + unsigned second = write_count; + if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) { + for (second = i + 1; second < write_count; second++) { + if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) { + op = data.bytes() == 4 ? 
aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; + opcodes[second] = aco_opcode::num_opcodes; + break; + } + } + } + + bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64; + unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes(); + + unsigned inline_offset = base_offset + offsets[i]; + unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535; + Temp address_offset = address; + if (inline_offset > max_offset) { + address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); + inline_offset = offsets[i]; + } + assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */ + + if (write2) { + Temp second_data = write_datas[second]; + inline_offset /= data.bytes(); + bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); + } else { + bld.ds(op, address_offset, data, m, inline_offset); + } + } +} + +unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset) +{ + unsigned align = 16; + if (const_offset) + align = std::min(align, 1u << (ffs(const_offset) - 1)); + + return align; +} + + +aco_opcode get_buffer_store_op(bool smem, unsigned bytes) +{ + switch (bytes) { + case 1: + assert(!smem); + return aco_opcode::buffer_store_byte; + case 2: + assert(!smem); + return aco_opcode::buffer_store_short; + case 4: + return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword; + case 8: + return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2; + case 12: + assert(!smem); + return aco_opcode::buffer_store_dwordx3; + case 16: + return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4; + } + unreachable("Unexpected store size"); + return aco_opcode::num_opcodes; +} + +void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type, + Temp data, unsigned writemask, int swizzle_element_size, + unsigned *write_count, Temp *write_datas, unsigned *offsets) +{ + unsigned write_count_with_skips = 0; + bool skips[16]; + + /* determine how to split the data */ + unsigned todo = u_bit_consecutive(0, data.bytes()); + while (todo) { + int offset, bytes; + skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes); + offsets[write_count_with_skips] = offset; + if (skips[write_count_with_skips]) { + advance_write_mask(&todo, offset, bytes); + write_count_with_skips++; + continue; + } + + /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be + * larger than swizzle_element_size */ + bytes = MIN2(bytes, swizzle_element_size); + if (bytes % 4) + bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2); + + /* SMEM and GFX6 VMEM can't emit 12-byte stores */ + if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12) + bytes = 8; + + /* dword or larger stores have to be dword-aligned */ + unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4; + unsigned align_offset = instr ? 
nir_intrinsic_align_mul(instr) : 0; + bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0; + if (bytes >= 4 && !dword_aligned) + bytes = MIN2(bytes, 2); + + advance_write_mask(&todo, offset, bytes); + write_count_with_skips++; + } + + /* actually split data */ + split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data); + + /* remove skips */ + for (unsigned i = 0; i < write_count_with_skips; i++) { + if (skips[i]) + continue; + write_datas[*write_count] = write_datas[i]; + offsets[*write_count] = offsets[i]; + (*write_count)++; + } +} + +Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes, + unsigned split_cnt = 0u, Temp dst = Temp()) +{ + Builder bld(ctx->program, ctx->block); + unsigned dword_size = elem_size_bytes / 4; + + if (!dst.id()) + dst = bld.tmp(RegClass(reg_type, cnt * dword_size)); + + std::array allocated_vec; + aco_ptr instr {create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)}; + instr->definitions[0] = Definition(dst); + + for (unsigned i = 0; i < cnt; ++i) { + if (arr[i].id()) { + assert(arr[i].size() == dword_size); + allocated_vec[i] = arr[i]; + instr->operands[i] = Operand(arr[i]); + } else { + Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2)); + allocated_vec[i] = zero; + instr->operands[i] = Operand(zero); + } + } + + bld.insert(std::move(instr)); + + if (split_cnt) + emit_split_vector(ctx, dst, split_cnt); + else + ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */ + + return dst; +} + +inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset) +{ + if (const_offset >= 4096) { + unsigned excess_const_offset = const_offset / 4096u * 4096u; + const_offset %= 4096u; + + if (!voffset.id()) + voffset = bld.copy(bld.def(v1), Operand(excess_const_offset)); + else if (unlikely(voffset.regClass() == s1)) + voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset)); + else if (likely(voffset.regClass() == v1)) + voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset)); + else + unreachable("Unsupported register class of voffset"); + } + + return const_offset; +} + +void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, + unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false) +{ + assert(vdata.id()); + assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); + assert(vdata.size() >= 1 && vdata.size() <= 4); + + Builder bld(ctx->program, ctx->block); + aco_opcode op = get_buffer_store_op(false, vdata.bytes()); + const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset); + + Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); + Operand soffset_op = soffset.id() ? 
Operand(soffset) : Operand(0u);
+   Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
+                                 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
+                                 /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+
+   static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+}
+
+void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
+                      unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
+                      bool allow_combining = true, bool reorder = true, bool slc = false)
+{
+   Builder bld(ctx->program, ctx->block);
+   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+   assert(write_mask);
+   write_mask = widen_mask(write_mask, elem_size_bytes);
+
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
+                      allow_combining ? 16 : 4, &write_count, write_datas, offsets);
+
+   for (unsigned i = 0; i < write_count; i++) {
+      unsigned const_offset = offsets[i] + base_const_offset;
+      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+   }
+}
+
+void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
+                     unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
+                     unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
+{
+   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+   assert((num_components * elem_size_bytes / 4) == dst.size());
+   assert(!!stride != allow_combining);
+
+   Builder bld(ctx->program, ctx->block);
+
+   LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
+   info.component_stride = allow_combining ? 0 : stride;
+   info.glc = true;
+   info.swizzle_component_size = allow_combining ? 0 : 4;
+   info.align_mul = MIN2(elem_size_bytes, 4);
+   info.align_offset = 0;
+   info.soffset = soffset;
+   info.const_offset = base_const_offset;
+   emit_mubuf_load(ctx, bld, &info);
+}
+
+std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp offset = base_offset.first;
+   unsigned const_offset = base_offset.second;
+
+   if (!nir_src_is_const(*off_src)) {
+      Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
+      Temp with_stride;
+
+      /* Calculate indirect offset with stride */
+      if (likely(indirect_offset_arg.regClass() == v1))
+         with_stride = bld.v_mul24_imm(bld.def(v1), indirect_offset_arg, stride);
+      else if (indirect_offset_arg.regClass() == s1)
+         with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
+      else
+         unreachable("Unsupported register class of indirect offset");
+
+      /* Add to the supplied base offset */
+      if (offset.id() == 0)
+         offset = with_stride;
+      else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
+         offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
+      else if (offset.size() == 1 && with_stride.size() == 1)
+         offset = bld.vadd32(bld.def(v1), with_stride, offset);
+      else
+         unreachable("Unsupported register class of indirect offset");
+   } else {
+      unsigned const_offset_arg = nir_src_as_uint(*off_src);
+      const_offset += const_offset_arg * stride;
+   }
+
+   return std::make_pair(offset, const_offset);
+}
+
+std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
+{
+   Builder bld(ctx->program, ctx->block);
+   Temp offset;
+
+   if (off1.first.id() && off2.first.id()) {
+      if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
+         offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
+      else if (off1.first.size() == 1 && off2.first.size() == 1)
+         offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
+      else
+         unreachable("Unsupported register class of indirect offset");
+   } else {
+      offset = off1.first.id() ? off1.first : off2.first;
+   }
+
+   return std::make_pair(offset, off1.second + off2.second);
+}
+
+std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
+{
+   Builder bld(ctx->program, ctx->block);
+   unsigned const_offset = offs.second * multiplier;
+
+   if (!offs.first.id())
+      return std::make_pair(offs.first, const_offset);
+
+   Temp offset = unlikely(offs.first.regClass() == s1)
+                 ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
+                 : bld.v_mul24_imm(bld.def(v1), offs.first, multiplier);
+
+   return std::make_pair(offset, const_offset);
+}
+
+std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   /* base is the driver_location, which is already multiplied by 4, so is in dwords */
+   unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
+   /* component is in bytes */
+   const_offset += nir_intrinsic_component(instr) * component_stride;
+
+   /* offset should be interpreted in relation to the base, so the instruction effectively reads/writes another input/output when it has an offset */
+   nir_src *off_src = nir_get_io_offset_src(instr);
+   return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
+}
+
+std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
+{
+   return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
+}
+
+Temp get_tess_rel_patch_id(isel_context *ctx)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   switch (ctx->shader->info.stage) {
+   case MESA_SHADER_TESS_CTRL:
+      return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
+                      get_arg(ctx, ctx->args->ac.tcs_rel_ids));
+   case MESA_SHADER_TESS_EVAL:
+      return get_arg(ctx, ctx->args->tes_rel_patch_id);
+   default:
+      unreachable("Unsupported stage in get_tess_rel_patch_id");
+   }
+}
+
+std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+   Builder bld(ctx->program, ctx->block);
+
+   uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
+   uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
+
+   std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
+
+   nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+   offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
+
+   Temp rel_patch_id = get_tess_rel_patch_id(ctx);
+   Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
+   offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
+
+   return offset_mul(ctx, offs, 4u);
+}
+
+std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
+{
+   assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+   Builder bld(ctx->program, ctx->block);
+
+   uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
+   uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
+   uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
+   uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
+
+   std::pair<Temp, unsigned> offs = instr
+                                    ? 
get_intrinsic_io_basic_offset(ctx, instr, 4u) + : std::make_pair(Temp(), 0u); + + Temp rel_patch_id = get_tess_rel_patch_id(ctx); + Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride); + + if (per_vertex) { + assert(instr); + + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size); + + uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches); + offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset)); + } else { + uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size); + offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset)); + } + + return offs; +} + +std::pair get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + + unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out; + unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches; + + std::pair offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u); + + Temp rel_patch_id = get_tess_rel_patch_id(ctx); + Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u); + offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u)); + + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u); + + return offs; +} + +std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u) +{ + Builder bld(ctx->program, ctx->block); + + unsigned output_vertex_size = ctx->tcs_num_outputs * 16; + unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; + unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches; + unsigned attr_stride = ctx->tcs_num_patches; + + std::pair offs = instr + ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u) + : std::make_pair(Temp(), 0u); + + if (const_base_offset) + offs.second += const_base_offset * attr_stride; + + Temp rel_patch_id = get_tess_rel_patch_id(ctx); + Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, 16u); + offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset)); + + return offs; +} + +bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect) +{ + assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + + if (mask == 0) + return false; + + unsigned drv_loc = nir_intrinsic_base(instr); + nir_src *off_src = nir_get_io_offset_src(instr); + + if (!nir_src_is_const(*off_src)) { + *indirect = true; + return false; + } + + *indirect = false; + uint64_t slot = per_vertex + ? 
ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4] + : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0); + return (((uint64_t) 1) << slot) & mask; +} + +bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned write_mask = nir_intrinsic_write_mask(instr); + unsigned component = nir_intrinsic_component(instr); + unsigned idx = nir_intrinsic_base(instr) + component; + + nir_instr *off_instr = instr->src[1].ssa->parent_instr; + if (off_instr->type != nir_instr_type_load_const) + return false; + + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + idx += nir_src_as_uint(instr->src[1]) * 4u; + + if (instr->src[0].ssa->bit_size == 64) + write_mask = widen_mask(write_mask, 2); + + for (unsigned i = 0; i < 8; ++i) { + if (write_mask & (1 << i)) { + ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u); + ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1); } - - bytes_read += todo; + idx++; } - assert(result_size == num_components && result_size > 1); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)}; - for (unsigned i = 0; i < result_size; i++) - vec->operands[i] = Operand(result[i]); - vec->definitions[0] = Definition(dst); - ctx->block->instructions.emplace_back(std::move(vec)); - ctx->allocated_vec.emplace(dst.id(), result); + return true; } -Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type) +bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst) { - if (start == 0 && size == data.size()) - return type == RegType::vgpr ? as_vgpr(ctx, data) : data; - - unsigned size_hint = 1; - auto it = ctx->allocated_vec.find(data.id()); - if (it != ctx->allocated_vec.end()) - size_hint = it->second[0].size(); - if (size % size_hint || start % size_hint) - size_hint = 1; + /* Only TCS per-vertex inputs are supported by this function. + * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations is the same. + */ + if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq) + return false; - start /= size_hint; - size /= size_hint; + nir_src *off_src = nir_get_io_offset_src(instr); + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr; + bool can_use_temps = nir_src_is_const(*off_src) && + vertex_index_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; - Temp elems[size]; - for (unsigned i = 0; i < size; i++) - elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint)); + if (!can_use_temps) + return false; - if (size == 1) - return type == RegType::vgpr ? 
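/* Note on the mask handling in store_output_to_temps() above: a 64-bit source
 * occupies two 32-bit output temps per component, so the NIR write mask is
 * widened by 2. Assuming widen_mask() expands every set bit into that many
 * consecutive bits, a mask of 0b0101 (components x and z) becomes 0b00110011,
 * i.e. temps 0-1 and 4-5.
 */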
as_vgpr(ctx, elems[0]) : elems[0]; + unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src); + Temp *src = &ctx->inputs.temps[idx]; + create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; - for (unsigned i = 0; i < size; i++) - vec->operands[i] = Operand(elems[i]); - Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)}; - vec->definitions[0] = Definition(res); - ctx->block->instructions.emplace_back(std::move(vec)); - return res; + return true; } -void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align) +void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); - unsigned bytes_written = 0; - while (bytes_written < total_size * 4) { - unsigned todo = total_size * 4 - bytes_written; - bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0; - bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0; - - aco_opcode op = aco_opcode::last_opcode; - bool write2 = false; - unsigned size = 0; - if (todo >= 16 && aligned16) { - op = aco_opcode::ds_write_b128; - size = 4; - } else if (todo >= 16 && aligned8) { - op = aco_opcode::ds_write2_b64; - write2 = true; - size = 4; - } else if (todo >= 12 && aligned16) { - op = aco_opcode::ds_write_b96; - size = 3; - } else if (todo >= 8 && aligned8) { - op = aco_opcode::ds_write_b64; - size = 2; - } else if (todo >= 8) { - op = aco_opcode::ds_write2_b32; - write2 = true; - size = 2; - } else if (todo >= 4) { - op = aco_opcode::ds_write_b32; - size = 1; - } else { - assert(false); - } - unsigned offset = offset0 + offset1 + bytes_written; - unsigned max_offset = write2 ? 1020 : 65535; - Temp address_offset = address; - if (offset > max_offset) { - address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset); - offset = offset1 + bytes_written; - } - assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */ + if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) { + /* When the TCS only reads this output directly and for the same vertices as its invocation id, it is unnecessary to store the VS output to LDS. */ + bool indirect_write; + bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write); + if (temp_only_input && !indirect_write) + return; + } - if (write2) { - Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr); - Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr); - bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1); + std::pair offs = get_intrinsic_io_basic_offset(ctx, instr, 4u); + Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned write_mask = nir_intrinsic_write_mask(instr); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u; + + if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) { + /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. 
*/ + Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u)); + Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset); + store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true); + } else { + Temp lds_base; + + if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) { + /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */ + unsigned itemsize = ctx->stage == vertex_geometry_gs + ? ctx->program->info->vs.es_info.esgs_itemsize + : ctx->program->info->tes.es_info.esgs_itemsize; + Temp thread_id = emit_mbcnt(ctx, bld.def(v1)); + Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24)); + Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id, + bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size)); + lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize); + } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) { + /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS. + * GFX9+: LS is merged into HS, but still uses the same LDS layout. + */ + Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id); + lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u); } else { - Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr); - bld.ds(op, address_offset, val, m, offset); + unreachable("Invalid LS or ES stage"); } - bytes_written += size * 4; + offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u)); + unsigned lds_align = calculate_lds_alignment(ctx, offs.second); + store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align); } } -void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, - Temp address, unsigned base_offset, unsigned align) +bool tcs_output_is_tess_factor(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex) +{ + if (per_vertex) + return false; + + unsigned off = nir_intrinsic_base(instr) * 4u; + return off == ctx->tcs_tess_lvl_out_loc || + off == ctx->tcs_tess_lvl_in_loc; + +} + +bool tcs_output_is_read_by_tes(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex) { - assert(util_is_power_of_two_nonzero(align) && align >= 4); + uint64_t mask = per_vertex + ? ctx->program->info->tcs.tes_inputs_read + : ctx->program->info->tcs.tes_patch_inputs_read; - Operand m = load_lds_size_m0(ctx); + bool indirect_write = false; + bool output_read_by_tes = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write); + return indirect_write || output_read_by_tes; +} - /* we need at most two stores for 32bit variables */ - int start[2], count[2]; - u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]); - u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]); - assert(wrmask == 0); +bool tcs_output_is_read_by_tcs(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex) +{ + uint64_t mask = per_vertex + ? 
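/* In the merged ES+GS path above, s_bfe_u32 with Operand(4u << 16 | 24) uses the
 * SOP2 bitfield encoding (width << 16 | offset) to extract 4 bits at bit 24 of
 * merged_wave_info, i.e. the wave's index within the threadgroup. The thread's
 * slot in the LDS ESGS area is then vertex_idx = wave_idx * wave_size + lane,
 * built with v_or_b32 because wave_idx * wave_size is wave_size-aligned and
 * lane < wave_size.
 */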
ctx->shader->info.outputs_read + : ctx->shader->info.patch_outputs_read; - /* one combined store is sufficient */ - if (count[0] == count[1]) { - Builder bld(ctx->program, ctx->block); + bool indirect_write = false; + bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write); + return indirect_write || output_read; +} - Temp address_offset = address; - if ((base_offset >> 2) + start[1] > 255) { - address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset); - base_offset = 0; - } +void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex) +{ + assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs); + assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); - assert(count[0] == 1); - Temp val0 = emit_extract_vector(ctx, data, start[0], v1); - Temp val1 = emit_extract_vector(ctx, data, start[1], v1); - aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; - base_offset = base_offset / elem_size_bytes; - bld.ds(op, address_offset, val0, val1, m, - base_offset + start[0], base_offset + start[1]); - return; - } + Builder bld(ctx->program, ctx->block); - for (unsigned i = 0; i < 2; i++) { - if (count[i] == 0) - continue; + Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + unsigned write_mask = nir_intrinsic_write_mask(instr); + + bool is_tess_factor = tcs_output_is_tess_factor(ctx, instr, per_vertex); + bool write_to_vmem = !is_tess_factor && tcs_output_is_read_by_tes(ctx, instr, per_vertex); + bool write_to_lds = is_tess_factor || tcs_output_is_read_by_tcs(ctx, instr, per_vertex); + + if (write_to_vmem) { + std::pair vmem_offs = per_vertex + ? 
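/* Routing implied by the three checks above:
 *
 *   output kind                          VMEM (offchip)   LDS
 *   tess factors                         no               yes (reloaded for the factor writes)
 *   read by the TES, not a tess factor   yes              only if the TCS also reads it back
 *   read back only by this TCS           no               yes
 */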
get_tcs_per_vertex_output_vmem_offset(ctx, instr) + : get_tcs_per_patch_output_vmem_offset(ctx, instr); - unsigned elem_size_words = elem_size_bytes / 4; - ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words, - base_offset, start[i] * elem_size_bytes, align); + Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u)); + Temp oc_lds = get_arg(ctx, ctx->args->oc_lds); + store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false); } - return; + + if (write_to_lds) { + std::pair lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex); + unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second); + store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align); + } +} + +void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex) +{ + assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs); + assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + + Builder bld(ctx->program, ctx->block); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + std::pair lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex); + unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + + load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align); } void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) { - if (ctx->stage == vertex_vs) { - visit_store_vs_output(ctx, instr); - } else if (ctx->stage == fragment_fs) { - visit_store_fs_output(ctx, instr); + if (ctx->stage == vertex_vs || + ctx->stage == tess_eval_vs || + ctx->stage == fragment_fs || + ctx->stage == ngg_vertex_gs || + ctx->stage == ngg_tess_eval_gs || + ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { + bool stored_to_temps = store_output_to_temps(ctx, instr); + if (!stored_to_temps) { + fprintf(stderr, "Unimplemented output offset instruction:\n"); + nir_print_instr(instr->src[1].ssa->parent_instr, stderr); + fprintf(stderr, "\n"); + abort(); + } + } else if (ctx->stage == vertex_es || + ctx->stage == vertex_ls || + ctx->stage == tess_eval_es || + (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) || + (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) || + (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) { + visit_store_ls_or_es_output(ctx, instr); + } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { + visit_store_tcs_output(ctx, instr, false); } else { unreachable("Shader stage not implemented"); } } +void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + visit_load_tcs_output(ctx, instr, false); +} + void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask) { Temp coord1 = emit_extract_vector(ctx, src, 0, v1); Temp coord2 = emit_extract_vector(ctx, src, 1, v1); Builder bld(ctx->program, ctx->block); - Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component); - bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component); + Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), 
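/* emit_interp_instr() uses the two-phase hardware interpolation:
 *   v_interp_p1_f32: tmp = P0 + i * P10
 *   v_interp_p2_f32: dst = tmp + j * P20
 * where (i, j) are the barycentrics in coord1/coord2 and P0/P10/P20 are the
 * per-attribute parameters addressed through M0. The lateKill flag set below on
 * chips with 16-bank LDS appears to exist so that register allocation never
 * reuses the p1 coordinate register as the p1 destination.
 */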
idx, component); + if (ctx->program->has_16bank_lds) + interp_p1.instr->operands[0].setLateKill(true); + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component); } void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) @@ -3055,32 +4486,60 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr } } -unsigned get_num_channels_from_data_format(unsigned data_format) +bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info, + unsigned offset, unsigned stride, unsigned channels) +{ + unsigned vertex_byte_size = vtx_info->chan_byte_size * channels; + if (vtx_info->chan_byte_size != 4 && channels == 3) + return false; + return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) || + (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0); +} + +uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info, + unsigned offset, unsigned stride, unsigned *channels) { - switch (data_format) { + if (!vtx_info->chan_byte_size) { + *channels = vtx_info->num_channels; + return vtx_info->chan_format; + } + + unsigned num_channels = *channels; + if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) { + unsigned new_channels = num_channels + 1; + /* first, assume more loads is worse and try using a larger data format */ + while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) { + new_channels++; + /* don't make the attribute potentially out-of-bounds */ + if (offset + new_channels * vtx_info->chan_byte_size > stride) + new_channels = 5; + } + + if (new_channels == 5) { + /* then try decreasing load size (at the cost of more loads) */ + new_channels = *channels; + while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) + new_channels--; + } + + if (new_channels < *channels) + *channels = new_channels; + num_channels = new_channels; + } + + switch (vtx_info->chan_format) { case V_008F0C_BUF_DATA_FORMAT_8: + return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8, + V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1]; case V_008F0C_BUF_DATA_FORMAT_16: + return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16, + V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1]; case V_008F0C_BUF_DATA_FORMAT_32: - return 1; - case V_008F0C_BUF_DATA_FORMAT_8_8: - case V_008F0C_BUF_DATA_FORMAT_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32: - return 2; - case V_008F0C_BUF_DATA_FORMAT_10_11_11: - case V_008F0C_BUF_DATA_FORMAT_11_11_10: - case V_008F0C_BUF_DATA_FORMAT_32_32_32: - return 3; - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: - case V_008F0C_BUF_DATA_FORMAT_10_10_10_2: - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: - return 4; - default: - break; + return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, + V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1]; } - - return 4; + unreachable("shouldn't reach here"); + return V_008F0C_BUF_DATA_FORMAT_INVALID; } /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. 
@@ -3117,7 +4576,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (ctx->stage & sw_vs) { + if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { nir_instr *off_instr = instr->src[0].ssa->parent_instr; if (off_instr->type != nir_instr_type_load_const) { @@ -3137,24 +4596,24 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location]; unsigned dfmt = attrib_format & 0xf; - unsigned nfmt = (attrib_format >> 4) & 0x7; - unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt); + const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt); + unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; - unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels); + unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3; bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location); if (post_shuffle) num_channels = MAX2(num_channels, 3); - Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u)); + Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u)); + Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off); Temp index; if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) { uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location]; Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance); if (divisor) { - ctx->needs_instance_id = true; Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id); if (divisor != 1) { Temp divided = bld.tmp(v1); @@ -3172,53 +4631,109 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) get_arg(ctx, ctx->args->ac.vertex_id)); } - if (attrib_stride != 0 && attrib_offset > attrib_stride) { - index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index); - attrib_offset = attrib_offset % attrib_stride; - } + Temp channels[num_channels]; + unsigned channel_start = 0; + bool direct_fetch = false; + + /* skip unused channels at the start */ + if (vtx_info->chan_byte_size && !post_shuffle) { + channel_start = ffs(mask) - 1; + for (unsigned i = 0; i < channel_start; i++) + channels[i] = Temp(0, s1); + } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) { + num_channels = 3 - (ffs(mask) - 1); + } + + /* load channels */ + while (channel_start < num_channels) { + unsigned fetch_size = num_channels - channel_start; + unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; + bool expanded = false; + + /* use MUBUF when possible to avoid possible alignment issues */ + /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */ + bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || + nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || + nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && + vtx_info->chan_byte_size == 4; + unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; + if (!use_mubuf) { + fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size); + } else { + if (fetch_size == 3 && ctx->options->chip_class == GFX6) { + /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. 
*/ + fetch_size = 4; + expanded = true; + } + } - Operand soffset(0u); - if (attrib_offset >= 4096) { - soffset = bld.copy(bld.def(s1), Operand(attrib_offset)); - attrib_offset = 0; - } + Temp fetch_index = index; + if (attrib_stride != 0 && fetch_offset > attrib_stride) { + fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); + fetch_offset = fetch_offset % attrib_stride; + } - aco_opcode opcode; - switch (num_channels) { - case 1: - opcode = aco_opcode::tbuffer_load_format_x; - break; - case 2: - opcode = aco_opcode::tbuffer_load_format_xy; - break; - case 3: - opcode = aco_opcode::tbuffer_load_format_xyz; - break; - case 4: - opcode = aco_opcode::tbuffer_load_format_xyzw; - break; - default: - unreachable("Unimplemented load_input vector size"); - } + Operand soffset(0u); + if (fetch_offset >= 4096) { + soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096)); + fetch_offset %= 4096; + } + + aco_opcode opcode; + switch (fetch_size) { + case 1: + opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; + break; + case 2: + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + break; + case 3: + assert(ctx->options->chip_class >= GFX7 || + (!use_mubuf && ctx->options->chip_class == GFX6)); + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; + break; + case 4: + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; + break; + default: + unreachable("Unimplemented load_input vector size"); + } - Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst; + Temp fetch_dst; + if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle && + !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE || + num_channels <= 3)) { + direct_fetch = true; + fetch_dst = dst; + } else { + fetch_dst = bld.tmp(RegType::vgpr, fetch_size); + } - aco_ptr mubuf{create_instruction(opcode, Format::MTBUF, 3, 1)}; - mubuf->operands[0] = Operand(index); - mubuf->operands[1] = Operand(list); - mubuf->operands[2] = soffset; - mubuf->definitions[0] = Definition(tmp); - mubuf->idxen = true; - mubuf->can_reorder = true; - mubuf->dfmt = dfmt; - mubuf->nfmt = nfmt; - assert(attrib_offset < 4096); - mubuf->offset = attrib_offset; - ctx->block->instructions.emplace_back(std::move(mubuf)); + if (use_mubuf) { + Instruction *mubuf = bld.mubuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_offset, false, true).instr; + static_cast(mubuf)->can_reorder = true; + } else { + Instruction *mtbuf = bld.mtbuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_dfmt, nfmt, fetch_offset, false, true).instr; + static_cast(mtbuf)->can_reorder = true; + } + + emit_split_vector(ctx, fetch_dst, fetch_dst.size()); - emit_split_vector(ctx, tmp, tmp.size()); + if (fetch_size == 1) { + channels[channel_start] = fetch_dst; + } else { + for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++) + channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1); + } + + channel_start += fetch_size; + } - if (tmp.id() != dst.id()) { + if (!direct_fetch) { bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; @@ -3227,13 +4742,18 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) const unsigned *swizzle = post_shuffle ? 
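/* The split above exists because the MUBUF/MTBUF immediate offset field is only
 * 12 bits (0..4095): any larger constant part is rounded down to a multiple of
 * 4096 and moved into soffset, leaving a remainder that still fits the
 * instruction encoding.
 */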
swizzle_post_shuffle : swizzle_normal; aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; + std::array elems; + unsigned num_temp = 0; for (unsigned i = 0; i < dst.size(); i++) { unsigned idx = i + component; - if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) { - Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1); - vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha)); - } else if (idx < num_channels) { - vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1)); + if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) { + Temp channel = channels[swizzle[idx]]; + if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE) + channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel); + vec->operands[i] = Operand(channel); + + num_temp++; + elems[i] = channel; } else if (is_float && idx == 3) { vec->operands[i] = Operand(0x3f800000u); } else if (!is_float && idx == 3) { @@ -3245,10 +4765,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) vec->definitions[0] = Definition(dst); ctx->block->instructions.emplace_back(std::move(vec)); emit_split_vector(ctx, dst, dst.size()); - } - } else if (ctx->stage == fragment_fs) { - nir_instr *off_instr = instr->src[0].ssa->parent_instr; + if (num_temp == dst.size()) + ctx->allocated_vec.emplace(dst.id(), elems); + } + } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) { + unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1; + nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr; if (off_instr->type != nir_instr_type_load_const || nir_instr_as_load_const(off_instr)->value[0].u32 != 0) { fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); @@ -3257,13 +4780,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); - nir_const_value* offset = nir_src_as_const_value(instr->src[0]); + nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]); if (offset) { assert(offset->u32 == 0); } else { /* the lower 15bit of the prim_mask contain the offset into LDS * while the upper bits contain the number of prims */ - Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa); assert(offset_src.regClass() == s1 && "TODO: divergent offsets..."); Builder bld(ctx->program, ctx->block); Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u)); @@ -3275,28 +4798,208 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) unsigned idx = nir_intrinsic_base(instr); unsigned component = nir_intrinsic_component(instr); + unsigned vertex_id = 2; /* P0 */ + + if (instr->intrinsic == nir_intrinsic_load_input_vertex) { + nir_const_value* src0 = nir_src_as_const_value(instr->src[0]); + switch (src0->u32) { + case 0: + vertex_id = 2; /* P0 */ + break; + case 1: + vertex_id = 0; /* P10 */ + break; + case 2: + vertex_id = 1; /* P20 */ + break; + default: + unreachable("invalid vertex index"); + } + } if (dst.size() == 1) { - bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component); + bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component); } else { aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; for (unsigned i = 0; i 
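/* Components the vertex format does not provide are filled above with the API
 * default of (0, 0, 0, 1): a missing .w becomes 1.0f (0x3f800000) for float
 * num-formats or integer 1 otherwise, while missing .y/.z are expected to read
 * back as zero.
 */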
< dst.size(); i++) - vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i); + vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i); vec->definitions[0] = Definition(dst); bld.insert(std::move(vec)); } + } else if (ctx->shader->info.stage == MESA_SHADER_TESS_EVAL) { + Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u)); + Temp soffset = get_arg(ctx, ctx->args->oc_lds); + std::pair offs = get_tcs_per_patch_output_vmem_offset(ctx, instr); + unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8u; + + load_vmem_mubuf(ctx, dst, ring, offs.first, soffset, offs.second, elem_size_bytes, instr->dest.ssa.num_components); } else { unreachable("Shader stage not implemented"); } } +std::pair get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u) +{ + assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY); + + Builder bld(ctx->program, ctx->block); + nir_src *vertex_src = nir_get_io_vertex_index_src(instr); + Temp vertex_offset; + + if (!nir_src_is_const(*vertex_src)) { + /* better code could be created, but this case probably doesn't happen + * much in practice */ + Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa)); + for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) { + Temp elem; + + if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) { + elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]); + if (i % 2u) + elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem); + } else { + elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]); + } + + if (vertex_offset.id()) { + Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), + Operand(i), indirect_vertex); + vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond); + } else { + vertex_offset = elem; + } + } + + if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) + vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset); + } else { + unsigned vertex = nir_src_as_uint(*vertex_src); + if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) + vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), + get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]), + Operand((vertex % 2u) * 16u), Operand(16u)); + else + vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]); + } + + std::pair offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride); + offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u)); + return offset_mul(ctx, offs, 4u); +} + +void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY); + + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; + + if (ctx->stage == geometry_gs) { + std::pair offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size); + Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u)); + load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, 
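/* With merged ES+GS (vertex_geometry_gs / tess_eval_geometry_gs) the hardware
 * packs two 16-bit ES vertex offsets per gs_vtx_offset register, which is why
 * vertex i reads arg[i / 2 * 2] above and then either masks with 0xffff or
 * extracts the high half with a 16-bit shift / v_bfe_u32. The unmerged GFX6-8
 * geometry_gs path has one full offset per register instead.
 */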
true); + } else if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) { + std::pair offs = get_gs_per_vertex_input_offset(ctx, instr); + unsigned lds_align = calculate_lds_alignment(ctx, offs.second); + load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align); + } else { + unreachable("Unsupported GS stage."); + } +} + +void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + if (load_input_from_temps(ctx, instr, dst)) + return; + + std::pair offs = get_tcs_per_vertex_input_lds_offset(ctx, instr); + unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; + unsigned lds_align = calculate_lds_alignment(ctx, offs.second); + + load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align); +} + +void visit_load_tes_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); + + Builder bld(ctx->program, ctx->block); + + Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u)); + Temp oc_lds = get_arg(ctx, ctx->args->oc_lds); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; + std::pair offs = get_tcs_per_vertex_output_vmem_offset(ctx, instr); + + load_vmem_mubuf(ctx, dst, ring, offs.first, oc_lds, offs.second, elem_size_bytes, instr->dest.ssa.num_components, 0u, true, true); +} + +void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) +{ + switch (ctx->shader->info.stage) { + case MESA_SHADER_GEOMETRY: + visit_load_gs_per_vertex_input(ctx, instr); + break; + case MESA_SHADER_TESS_CTRL: + visit_load_tcs_per_vertex_input(ctx, instr); + break; + case MESA_SHADER_TESS_EVAL: + visit_load_tes_per_vertex_input(ctx, instr); + break; + default: + unreachable("Unimplemented shader stage"); + } +} + +void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + visit_load_tcs_output(ctx, instr, true); +} + +void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr) +{ + assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs); + assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + + visit_store_tcs_output(ctx, instr, true); +} + +void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr) +{ + assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); + + Builder bld(ctx->program, ctx->block); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + Operand tes_u(get_arg(ctx, ctx->args->tes_u)); + Operand tes_v(get_arg(ctx, ctx->args->tes_v)); + Operand tes_w(0u); + + if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) { + Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v); + tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp); + tes_w = Operand(tmp); + } + + Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w); + emit_split_vector(ctx, tess_coord, 3); +} + Temp load_desc_ptr(isel_context *ctx, unsigned desc_set) { if (ctx->program->info->need_indirect_descriptor_sets) { Builder bld(ctx->program, ctx->block); Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0])); - return bld.smem(aco_opcode::s_load_dword, bld.def(s1), 
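/* For triangle domains the third tessellation coordinate is implied by the other
 * two, w = 1 - u - v, which the code above computes as 1.0f - (u + v); quad and
 * isoline domains simply get w = 0.
 */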
ptr64, Operand(desc_set << 2));//, false, false, false); + Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2)); + return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false); } return get_arg(ctx, ctx->args->descriptor_sets[desc_set]); @@ -3357,158 +5060,34 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) nir_const_index ? Operand(const_index) : Operand(index), Operand(desc_ptr)); } else { - index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - nir_const_index ? Operand(const_index) : Operand(index), - Operand(desc_ptr)); - } - - bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index); -} - -void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, - Temp rsrc, Temp offset, bool glc=false, bool readonly=true) -{ - Builder bld(ctx->program, ctx->block); - - unsigned num_bytes = dst.size() * 4; - bool dlc = glc && ctx->options->chip_class >= GFX10; - - aco_opcode op; - if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) { - Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); - unsigned const_offset = 0; - - Temp lower = Temp(); - if (num_bytes > 16) { - assert(num_components == 3 || num_components == 4); - op = aco_opcode::buffer_load_dwordx4; - lower = bld.tmp(v4); - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; - mubuf->definitions[0] = Definition(lower); - mubuf->operands[0] = vaddr; - mubuf->operands[1] = Operand(rsrc); - mubuf->operands[2] = soffset; - mubuf->offen = (offset.type() == RegType::vgpr); - mubuf->glc = glc; - mubuf->dlc = dlc; - mubuf->barrier = readonly ? barrier_none : barrier_buffer; - mubuf->can_reorder = readonly; - bld.insert(std::move(mubuf)); - emit_split_vector(ctx, lower, 2); - num_bytes -= 16; - const_offset = 16; - } - - switch (num_bytes) { - case 4: - op = aco_opcode::buffer_load_dword; - break; - case 8: - op = aco_opcode::buffer_load_dwordx2; - break; - case 12: - op = aco_opcode::buffer_load_dwordx3; - break; - case 16: - op = aco_opcode::buffer_load_dwordx4; - break; - default: - unreachable("Load SSBO not implemented for this size."); - } - aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; - mubuf->operands[0] = vaddr; - mubuf->operands[1] = Operand(rsrc); - mubuf->operands[2] = soffset; - mubuf->offen = (offset.type() == RegType::vgpr); - mubuf->glc = glc; - mubuf->dlc = dlc; - mubuf->barrier = readonly ? 
barrier_none : barrier_buffer; - mubuf->can_reorder = readonly; - mubuf->offset = const_offset; - aco_ptr instr = std::move(mubuf); - - if (dst.size() > 4) { - assert(lower != Temp()); - Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size()); - instr->definitions[0] = Definition(upper); - bld.insert(std::move(instr)); - if (dst.size() == 8) - emit_split_vector(ctx, upper, 2); - instr.reset(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1)); - instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2)); - instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2)); - instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2)); - if (dst.size() == 8) - instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2)); - } - - if (dst.type() == RegType::sgpr) { - Temp vec = bld.tmp(RegType::vgpr, dst.size()); - instr->definitions[0] = Definition(vec); - bld.insert(std::move(instr)); - expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1); - } else { - instr->definitions[0] = Definition(dst); - bld.insert(std::move(instr)); - emit_split_vector(ctx, dst, num_components); - } - } else { - switch (num_bytes) { - case 4: - op = aco_opcode::s_buffer_load_dword; - break; - case 8: - op = aco_opcode::s_buffer_load_dwordx2; - break; - case 12: - case 16: - op = aco_opcode::s_buffer_load_dwordx4; - break; - case 24: - case 32: - op = aco_opcode::s_buffer_load_dwordx8; - break; - default: - unreachable("Load SSBO not implemented for this size."); - } - aco_ptr load{create_instruction(op, Format::SMEM, 2, 1)}; - load->operands[0] = Operand(rsrc); - load->operands[1] = Operand(bld.as_uniform(offset)); - assert(load->operands[1].getTemp().type() == RegType::sgpr); - load->definitions[0] = Definition(dst); - load->glc = glc; - load->dlc = dlc; - load->barrier = readonly ? barrier_none : barrier_buffer; - load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works - assert(ctx->options->chip_class >= GFX8 || !glc); - - /* trim vector */ - if (dst.size() == 3) { - Temp vec = bld.tmp(s4); - load->definitions[0] = Definition(vec); - bld.insert(std::move(load)); - emit_split_vector(ctx, vec, 4); - - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - emit_extract_vector(ctx, vec, 0, s1), - emit_extract_vector(ctx, vec, 1, s1), - emit_extract_vector(ctx, vec, 2, s1)); - } else if (dst.size() == 6) { - Temp vec = bld.tmp(s8); - load->definitions[0] = Definition(vec); - bld.insert(std::move(load)); - emit_split_vector(ctx, vec, 4); - - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - emit_extract_vector(ctx, vec, 0, s2), - emit_extract_vector(ctx, vec, 1, s2), - emit_extract_vector(ctx, vec, 2, s2)); - } else { - bld.insert(std::move(load)); - } - emit_split_vector(ctx, dst, num_components); + index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + nir_const_index ? 
Operand(const_index) : Operand(index), + Operand(desc_ptr)); } + + bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index); +} + +void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size, + Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, + bool glc=false, bool readonly=true) +{ + Builder bld(ctx->program, ctx->block); + + bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly); + if (use_smem) + offset = bld.as_uniform(offset); + + LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc}; + info.glc = glc; + info.barrier = readonly ? barrier_none : barrier_buffer; + info.can_reorder = readonly; + info.align_mul = align_mul; + info.align_offset = align_offset; + if (use_smem) + emit_smem_load(ctx, bld, &info); + else + emit_mubuf_load(ctx, bld, &info); } void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) @@ -3546,20 +5125,20 @@ void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr) rsrc = convert_pointer_to_64_bit(ctx, rsrc); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); } - - load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa)); + unsigned size = instr->dest.ssa.bit_size / 8; + load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), + nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr)); } void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - unsigned offset = nir_intrinsic_base(instr); + unsigned count = instr->dest.ssa.num_components; nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]); - if (index_cv && instr->dest.ssa.bit_size == 32) { - unsigned count = instr->dest.ssa.num_components; + if (index_cv && instr->dest.ssa.bit_size == 32) { unsigned start = (offset + index_cv->u32) / 4u; start -= ctx->args->ac.base_inline_push_consts; if (start + count <= ctx->args->ac.num_inline_push_consts) { @@ -3582,9 +5161,22 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); Temp vec = dst; bool trim = false; + bool aligned = true; + + if (instr->dest.ssa.bit_size == 8) { + aligned = index_cv && (offset + index_cv->u32) % 4 == 0; + bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4); + if (!aligned) + vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2); + } else if (instr->dest.ssa.bit_size == 16) { + aligned = index_cv && (offset + index_cv->u32) % 4 == 0; + if (!aligned) + vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1); + } + aco_opcode op; - switch (dst.size()) { + switch (vec.size()) { case 1: op = aco_opcode::s_load_dword; break; @@ -3609,6 +5201,12 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) bld.smem(op, Definition(vec), ptr, index); + if (!aligned) { + Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index); + byte_align_scalar(ctx, vec, byte_offset, dst); + return; + } + if (trim) { emit_split_vector(ctx, vec, 4); RegClass rc = dst.size() == 3 ? 
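/* Sub-dword push constants that are not dword-aligned are handled above by
 * loading the enclosing dword(s) into a temporary SGPR vector and then letting
 * byte_align_scalar() shift the requested bytes into place; e.g. a 16-bit value
 * at byte 2 of its dword effectively becomes a 16-bit right shift of the loaded
 * dword.
 */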
s1 : s2; @@ -3653,14 +5251,15 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)), Operand(MIN2(base + range, ctx->shader->constant_data_size)), Operand(desc_type)); - - load_buffer(ctx, instr->num_components, dst, rsrc, offset); + unsigned size = instr->dest.ssa.bit_size / 8; + // TODO: get alignment information for subdword constants + load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0); } void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) { if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) - ctx->cf_info.exec_potentially_empty = true; + ctx->cf_info.exec_potentially_empty_discard = true; ctx->program->needs_exact = true; @@ -3679,7 +5278,7 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) Builder bld(ctx->program, ctx->block); if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) - ctx->cf_info.exec_potentially_empty = true; + ctx->cf_info.exec_potentially_empty_discard = true; bool divergent = ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue; @@ -3707,6 +5306,9 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) ctx->block->kind |= block_kind_break; unsigned idx = ctx->block->index; + ctx->cf_info.parent_loop.has_divergent_branch = true; + ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx; + /* remove critical edges from linear CFG */ bld.branch(aco_opcode::p_branch); Block* break_block = ctx->program->create_and_insert_block(); @@ -3900,7 +5502,7 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr, Operand off; if (!index_set) { - off = Operand(offset); + off = bld.copy(bld.def(s1), Operand(offset)); } else { off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index))); @@ -3976,7 +5578,7 @@ static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array) * The sample index should be adjusted as follows: * sample_index = (fmask >> (sample_index * 4)) & 0xF; */ -static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr) +static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector& coords, Operand sample_index, Temp fmask_desc_ptr) { Builder bld(ctx->program, ctx->block); Temp fmask = bld.tmp(v1); @@ -3984,9 +5586,12 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) : 0; - aco_ptr load{create_instruction(aco_opcode::image_load, Format::MIMG, 2, 1)}; - load->operands[0] = Operand(coords); - load->operands[1] = Operand(fmask_desc_ptr); + Temp coord = da ? 
bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) : + bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]); + aco_ptr load{create_instruction(aco_opcode::image_load, Format::MIMG, 3, 1)}; + load->operands[0] = Operand(fmask_desc_ptr); + load->operands[1] = Operand(s4); /* no sampler */ + load->operands[2] = Operand(coord); load->definitions[0] = Definition(fmask); load->glc = false; load->dlc = false; @@ -3998,8 +5603,12 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo ctx->block->instructions.emplace_back(std::move(load)); Operand sample_index4; - if (sample_index.isConstant() && sample_index.constantValue() < 16) { - sample_index4 = Operand(sample_index.constantValue() << 2); + if (sample_index.isConstant()) { + if (sample_index.constantValue() < 16) { + sample_index4 = Operand(sample_index.constantValue() << 2); + } else { + sample_index4 = Operand(0u); + } } else if (sample_index.regClass() == s1) { sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); } else { @@ -4039,43 +5648,36 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; int count = image_type_to_components_count(dim, is_array); - std::vector coords(count); + std::vector coords(count); + Builder bld(ctx->program, ctx->block); if (is_ms) { - Operand sample_index; - nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]); - if (sample_cv) - sample_index = Operand(sample_cv->u32); - else - sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1)); - + count--; + Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa); + /* get sample index */ if (instr->intrinsic == nir_intrinsic_image_deref_load) { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)}; - for (unsigned i = 0; i < vec->operands.size(); i++) - vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1)); - Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2}; - vec->definitions[0] = Definition(fmask_load_address); - ctx->block->instructions.emplace_back(std::move(vec)); + nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]); + Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1)); + std::vector fmask_load_address; + for (unsigned i = 0; i < (is_array ? 
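/* Illustration only (hypothetical host-side helper, not part of the change):
 * each sample's fragment slot occupies one nibble of the FMASK word, matching
 * the shift-and-mask formula quoted in the comment above.
 */
static inline unsigned fmask_fragment_index(uint32_t fmask, unsigned sample_index)
{
   return (fmask >> (sample_index * 4u)) & 0xFu;
}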
3 : 2); i++) + fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1)); Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false); - sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr)); + coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr); + } else { + coords[count] = emit_extract_vector(ctx, src2, 0, v1); } - count--; - coords[count] = sample_index; } - if (count == 1 && !gfx9_1d) - return emit_extract_vector(ctx, src0, 0, v1); - if (gfx9_1d) { - coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1)); + coords[0] = emit_extract_vector(ctx, src0, 0, v1); coords.resize(coords.size() + 1); - coords[1] = Operand((uint32_t) 0); + coords[1] = bld.copy(bld.def(v1), Operand(0u)); if (is_array) - coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1)); + coords[2] = emit_extract_vector(ctx, src0, 1, v1); } else { for (int i = 0; i < count; i++) - coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1)); + coords[i] = emit_extract_vector(ctx, src0, i, v1); } if (instr->intrinsic == nir_intrinsic_image_deref_load || @@ -4084,12 +5686,12 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; if (!level_zero) - coords.emplace_back(Operand(get_ssa_temp(ctx, instr->src[lod_index].ssa))); + coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa)); } aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; for (unsigned i = 0; i < coords.size(); i++) - vec->operands[i] = coords[i]; + vec->operands[i] = Operand(coords[i]); Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())}; vec->definitions[0] = Definition(res); ctx->block->instructions.emplace_back(std::move(vec)); @@ -4130,8 +5732,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) unreachable(">4 channel buffer image load"); } aco_ptr load{create_instruction(opcode, Format::MUBUF, 3, 1)}; - load->operands[0] = Operand(vindex); - load->operands[1] = Operand(rsrc); + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(vindex); load->operands[2] = Operand((uint32_t) 0); Temp tmp; if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) @@ -4163,9 +5765,10 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip; - aco_ptr load{create_instruction(opcode, Format::MIMG, 2, 1)}; - load->operands[0] = Operand(coords); - load->operands[1] = Operand(resource); + aco_ptr load{create_instruction(opcode, Format::MIMG, 3, 1)}; + load->operands[0] = Operand(resource); + load->operands[1] = Operand(s4); /* no sampler */ + load->operands[2] = Operand(coords); load->definitions[0] = Definition(tmp); load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 
1 : 0; load->dlc = load->glc && ctx->options->chip_class >= GFX10; @@ -4211,8 +5814,8 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) unreachable(">4 channel buffer image store"); } aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; - store->operands[0] = Operand(vindex); - store->operands[1] = Operand(rsrc); + store->operands[0] = Operand(rsrc); + store->operands[1] = Operand(vindex); store->operands[2] = Operand((uint32_t) 0); store->operands[3] = Operand(data); store->idxen = true; @@ -4232,11 +5835,10 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip; - aco_ptr store{create_instruction(opcode, Format::MIMG, 4, 0)}; - store->operands[0] = Operand(coords); - store->operands[1] = Operand(resource); - store->operands[2] = Operand(s4); - store->operands[3] = Operand(data); + aco_ptr store{create_instruction(opcode, Format::MIMG, 3, 0)}; + store->operands[0] = Operand(resource); + store->operands[1] = Operand(data); + store->operands[2] = Operand(coords); store->glc = glc; store->dlc = false; store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -4328,8 +5930,8 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented."); aco_ptr mubuf{create_instruction(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)}; - mubuf->operands[0] = Operand(vindex); - mubuf->operands[1] = Operand(resource); + mubuf->operands[0] = Operand(resource); + mubuf->operands[1] = Operand(vindex); mubuf->operands[2] = Operand((uint32_t)0); mubuf->operands[3] = Operand(data); if (return_previous) @@ -4347,11 +5949,10 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) Temp coords = get_image_coords(ctx, instr, type); Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); - aco_ptr mimg{create_instruction(image_op, Format::MIMG, 4, return_previous ? 1 : 0)}; - mimg->operands[0] = Operand(coords); - mimg->operands[1] = Operand(resource); - mimg->operands[2] = Operand(s4); /* no sampler */ - mimg->operands[3] = Operand(data); + aco_ptr mimg{create_instruction(image_op, Format::MIMG, 3, return_previous ? 
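/* Note the MIMG operand order used throughout these hunks: operands[0] is the
 * image resource, operands[1] is the sampler (or an undefined s4 placeholder
 * when none is needed) for loads/resinfo and the data for stores/atomics, and
 * operands[2] is the coordinate vector built by get_image_coords().
 */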
1 : 0)}; + mimg->operands[0] = Operand(resource); + mimg->operands[1] = Operand(data); + mimg->operands[2] = Operand(coords); if (return_previous) mimg->definitions[0] = Definition(dst); mimg->glc = return_previous; @@ -4370,31 +5971,27 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements) { if (in_elements && ctx->options->chip_class == GFX8) { + /* we only have to divide by 1, 2, 4, 8, 12 or 16 */ Builder bld(ctx->program, ctx->block); + Temp size = emit_extract_vector(ctx, desc, 2, s1); + + Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size); + size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u)); + Temp stride = emit_extract_vector(ctx, desc, 1, s1); stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u)); - stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride); - stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride); - Temp size = emit_extract_vector(ctx, desc, 2, s1); - size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size); - - Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride); - res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res); - - // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16} - /* idea - * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32) - * in case 12 (or 3?), we have to divide by 3: - * set v_skip in case it's 12 (if we also have to take care of 3, shift first) - * use v_mul_hi_u32 with magic number to divide - * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane - * disable v_skip - * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions - */ + Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u)); + size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12)); + Temp shr_dst = dst.type() == RegType::vgpr ? 
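[editor's sketch, not part of the patch] The get_buffer_size() rewrite above replaces the float-reciprocal sequence with integer math. Under the patch's own assumption that the stride is one of 1, 2, 4, 8, 12 or 16, a multiply-high by 0xaaaaaaab plus one extra shift gives size / 3, and a shift by the lowest set bit of the stride (s_ff1_i32_b32) handles the rest, so /12 becomes /3 followed by /4. The helper names below are made up for illustration; __builtin_ctz stands in for s_ff1_i32_b32.

#include <cassert>
#include <cstdint>

static uint32_t div3(uint32_t x)
{
   /* floor((x * 0xaaaaaaab) >> 33) == x / 3 for any 32-bit x
    * (the >> 33 is the v_mul_hi_u32 high half plus the s_lshr_b32 by 1) */
   return uint32_t((uint64_t(x) * 0xaaaaaaabu) >> 33);
}

static uint32_t size_div_stride(uint32_t size, uint32_t stride)
{
   if (stride == 12)
      size = div3(size);                  /* the s_cselect picks the /3 result */
   return size >> __builtin_ctz(stride);  /* 12 -> 2, 1/2/4/8/16 -> 0..4 */
}

int main()
{
   const uint32_t sizes[] = {0u, 1u, 35u, 36u, 0xfffffff4u};
   for (uint32_t s : sizes)
      assert(size_div_stride(s, 12) == s / 12);
   const uint32_t strides[] = {1u, 2u, 4u, 8u, 16u};
   for (uint32_t st : strides)
      assert(size_div_stride(1000u, st) == 1000u / st);
   return 0;
}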
bld.tmp(s1) : dst; + bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), + size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); + if (dst.type() == RegType::vgpr) + bld.copy(Definition(dst), shr_dst); + + /* TODO: we can probably calculate this faster with v_skip when stride != 12 */ } else { emit_extract_vector(ctx, desc, 2, dst); } @@ -4421,9 +6018,10 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - aco_ptr mimg{create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)}; - mimg->operands[0] = Operand(lod); - mimg->operands[1] = Operand(resource); + aco_ptr mimg{create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)}; + mimg->operands[0] = Operand(resource); + mimg->operands[1] = Operand(s4); /* no sampler */ + mimg->operands[2] = Operand(lod); uint8_t& dmask = mimg->dmask; mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; @@ -4472,7 +6070,9 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); - load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false); + unsigned size = instr->dest.ssa.bit_size / 8; + load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), + nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false); } void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) @@ -4480,113 +6080,60 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Builder bld(ctx->program, ctx->block); Temp data = get_ssa_temp(ctx, instr->src[0].ssa); unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; - unsigned writemask = nir_intrinsic_write_mask(instr); + unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); Temp offset = get_ssa_temp(ctx, instr->src[2].ssa); Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] && - ctx->options->chip_class >= GFX8; + ctx->options->chip_class >= GFX8 && + elem_size_bytes >= 4; if (smem) offset = bld.as_uniform(offset); bool smem_nonfs = smem && ctx->stage != fragment_fs; - while (writemask) { - int start, count; - u_bit_scan_consecutive_range(&writemask, &start, &count); - if (count == 3 && smem) { - writemask |= 1u << (start + 2); - count = 2; - } - int num_bytes = count * elem_size_bytes; - - if (num_bytes > 16) { - assert(elem_size_bytes == 8); - writemask |= (((count - 2) << 1) - 1) << (start + 2); - count = 2; - num_bytes = 16; - } - - // TODO: check alignment of sub-dword stores - // TODO: split 3 bytes. there is no store instruction for that - - Temp write_data; - if (count != instr->num_components) { - emit_split_vector(ctx, data, instr->num_components); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; - for (int i = 0; i < count; i++) { - Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4)); - vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem); - } - write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? 
RegType::sgpr : data.type(), count * elem_size_bytes / 4); - vec->definitions[0] = Definition(write_data); - ctx->block->instructions.emplace_back(std::move(vec)); - } else if (!smem && data.type() != RegType::vgpr) { - assert(num_bytes % 4 == 0); - write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data); - } else if (smem_nonfs && data.type() == RegType::vgpr) { - assert(num_bytes % 4 == 0); - write_data = bld.as_uniform(data); - } else { - write_data = data; - } + unsigned write_count = 0; + Temp write_datas[32]; + unsigned offsets[32]; + split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr), + data, writemask, 16, &write_count, write_datas, offsets); - aco_opcode vmem_op, smem_op; - switch (num_bytes) { - case 4: - vmem_op = aco_opcode::buffer_store_dword; - smem_op = aco_opcode::s_buffer_store_dword; - break; - case 8: - vmem_op = aco_opcode::buffer_store_dwordx2; - smem_op = aco_opcode::s_buffer_store_dwordx2; - break; - case 12: - vmem_op = aco_opcode::buffer_store_dwordx3; - smem_op = aco_opcode::last_opcode; - assert(!smem); - break; - case 16: - vmem_op = aco_opcode::buffer_store_dwordx4; - smem_op = aco_opcode::s_buffer_store_dwordx4; - break; - default: - unreachable("Store SSBO not implemented for this size."); - } - if (ctx->stage == fragment_fs) - smem_op = aco_opcode::p_fs_buffer_store_smem; + for (unsigned i = 0; i < write_count; i++) { + aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes()); + if (smem && ctx->stage == fragment_fs) + op = aco_opcode::p_fs_buffer_store_smem; if (smem) { - aco_ptr store{create_instruction(smem_op, Format::SMEM, 3, 0)}; + aco_ptr store{create_instruction(op, Format::SMEM, 3, 0)}; store->operands[0] = Operand(rsrc); - if (start) { + if (offsets[i]) { Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - offset, Operand(start * elem_size_bytes)); + offset, Operand(offsets[i])); store->operands[1] = Operand(off); } else { store->operands[1] = Operand(offset); } - if (smem_op != aco_opcode::p_fs_buffer_store_smem) + if (op != aco_opcode::p_fs_buffer_store_smem) store->operands[1].setFixed(m0); - store->operands[2] = Operand(write_data); + store->operands[2] = Operand(write_datas[i]); store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); store->dlc = false; store->disable_wqm = true; store->barrier = barrier_buffer; ctx->block->instructions.emplace_back(std::move(store)); ctx->program->wb_smem_l1_on_end = true; - if (smem_op == aco_opcode::p_fs_buffer_store_smem) { + if (op == aco_opcode::p_fs_buffer_store_smem) { ctx->block->kind |= block_kind_needs_lowering; ctx->program->needs_exact = true; } } else { - aco_ptr store{create_instruction(vmem_op, Format::MUBUF, 4, 0)}; - store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - store->operands[1] = Operand(rsrc); + aco_ptr store{create_instruction(op, Format::MUBUF, 4, 0)}; + store->operands[0] = Operand(rsrc); + store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); store->operands[2] = offset.type() == RegType::sgpr ? 
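[editor's sketch, not part of the patch] The idea behind widen_mask() + split_buffer_store() as used in the store path above: the per-component write mask is widened to a per-byte mask, then peeled into contiguous runs of at most 16 bytes, one store each. This only illustrates the splitting policy; the real helpers also honour alignment and the SMEM restrictions.

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

static uint32_t widen_mask(uint32_t mask, unsigned mul)
{
   uint32_t out = 0;
   for (unsigned i = 0; i < 32 && (mask >> i); i++)
      if (mask & (1u << i))
         out |= ((1u << mul) - 1u) << (i * mul);
   return out;
}

int main()
{
   /* vec4 of 32-bit components, components 0, 1 and 3 written */
   uint32_t bytes = widen_mask(0xbu, 4);
   std::vector<std::pair<unsigned, unsigned>> stores; /* (byte offset, byte count) */
   while (bytes) {
      unsigned start = __builtin_ctz(bytes);
      unsigned count = __builtin_ctz(~(bytes >> start)); /* run never reaches bit 31 here */
      if (count > 16)
         count = 16;                                     /* one store moves at most 16 bytes */
      stores.push_back({start, count});
      bytes &= ~(((1ull << count) - 1ull) << start);
   }
   assert(stores.size() == 2);
   assert(stores[0] == std::make_pair(0u, 8u));  /* dwordx2 at offset 0 */
   assert(stores[1] == std::make_pair(12u, 4u)); /* dword at offset 12 */
   return 0;
}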
Operand(offset) : Operand((uint32_t) 0); - store->operands[3] = Operand(write_data); - store->offset = start * elem_size_bytes; + store->operands[3] = Operand(write_datas[i]); + store->offset = offsets[i]; store->offen = (offset.type() == RegType::vgpr); store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); store->dlc = false; @@ -4671,8 +6218,8 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) } aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; - mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - mubuf->operands[1] = Operand(rsrc); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); mubuf->operands[3] = Operand(data); if (return_previous) @@ -4699,163 +6246,124 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); unsigned num_components = instr->num_components; - unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8; + unsigned component_size = instr->dest.ssa.bit_size / 8; + + LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)), + get_ssa_temp(ctx, &instr->dest.ssa), + num_components, component_size}; + info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + info.align_mul = nir_intrinsic_align_mul(instr); + info.align_offset = nir_intrinsic_align_offset(instr); + info.barrier = barrier_buffer; + info.can_reorder = false; + /* VMEM stores don't update the SMEM cache and it's difficult to prove that + * it's safe to use SMEM */ + bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE; + if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) { + emit_global_load(ctx, bld, &info); + } else { + info.offset = Operand(bld.as_uniform(info.offset)); + emit_smem_load(ctx, bld, &info); + } +} - Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); +void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) +{ + Builder bld(ctx->program, ctx->block); + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); - bool dlc = glc && ctx->options->chip_class >= GFX10; - aco_opcode op; - if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) { - bool global = ctx->options->chip_class >= GFX9; - aco_opcode op; - switch (num_bytes) { - case 4: - op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword; - break; - case 8: - op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2; - break; - case 12: - op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; - break; - case 16: - op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4; - break; - default: - unreachable("load_global not implemented for this size."); - } - aco_ptr flat{create_instruction(op, global ? 
Format::GLOBAL : Format::FLAT, 2, 1)}; - flat->operands[0] = Operand(addr); - flat->operands[1] = Operand(s1); - flat->glc = glc; - flat->dlc = dlc; - flat->barrier = barrier_buffer; + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp addr = get_ssa_temp(ctx, instr->src[1].ssa); + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + + if (ctx->options->chip_class >= GFX7) + addr = as_vgpr(ctx, addr); + + unsigned write_count = 0; + Temp write_datas[32]; + unsigned offsets[32]; + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, + 16, &write_count, write_datas, offsets); + + for (unsigned i = 0; i < write_count; i++) { + if (ctx->options->chip_class >= GFX7) { + unsigned offset = offsets[i]; + Temp store_addr = addr; + if (offset > 0 && ctx->options->chip_class < GFX9) { + Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1); + Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1); + Temp carry = bld.tmp(bld.lm); + bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); + + bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), + Operand(offset), addr0); + bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), + Operand(0u), addr1, + carry).def(1).setHint(vcc); + + store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1); + + offset = 0; + } - if (dst.type() == RegType::sgpr) { - Temp vec = bld.tmp(RegType::vgpr, dst.size()); - flat->definitions[0] = Definition(vec); - ctx->block->instructions.emplace_back(std::move(flat)); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); - } else { - flat->definitions[0] = Definition(dst); - ctx->block->instructions.emplace_back(std::move(flat)); - } - emit_split_vector(ctx, dst, num_components); - } else { - switch (num_bytes) { + bool global = ctx->options->chip_class >= GFX9; + aco_opcode op; + switch (write_datas[i].bytes()) { + case 1: + op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; + break; + case 2: + op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; + break; case 4: - op = aco_opcode::s_load_dword; + op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break; case 8: - op = aco_opcode::s_load_dwordx2; + op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; break; case 12: + op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3; + break; case 16: - op = aco_opcode::s_load_dwordx4; + op = global ? 
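[editor's sketch, not part of the patch] The pre-GFX9 path above cannot carry a constant offset on a FLAT store, so it folds the offset into the address with an add / add-with-carry pair (v_add_co_u32 + v_addc_co_u32). The same carry chain on plain 32-bit halves, with illustrative names rather than the ACO builder API:

#include <cassert>
#include <cstdint>

static void add_offset_64(uint32_t addr_lo, uint32_t addr_hi, uint32_t offset,
                          uint32_t *out_lo, uint32_t *out_hi)
{
   uint64_t lo = uint64_t(addr_lo) + offset; /* v_add_co_u32: carry goes to VCC */
   uint32_t carry = uint32_t(lo >> 32);
   *out_lo = uint32_t(lo);
   *out_hi = addr_hi + carry;                /* v_addc_co_u32 consumes the carry */
}

int main()
{
   uint32_t lo, hi;
   add_offset_64(0xfffffff0u, 0x12345678u, 0x20u, &lo, &hi);
   assert(lo == 0x10u && hi == 0x12345679u);
   return 0;
}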
aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; break; default: - unreachable("load_global not implemented for this size."); - } - aco_ptr load{create_instruction(op, Format::SMEM, 2, 1)}; - load->operands[0] = Operand(addr); - load->operands[1] = Operand(0u); - load->definitions[0] = Definition(dst); - load->glc = glc; - load->dlc = dlc; - load->barrier = barrier_buffer; - assert(ctx->options->chip_class >= GFX8 || !glc); - - if (dst.size() == 3) { - /* trim vector */ - Temp vec = bld.tmp(s4); - load->definitions[0] = Definition(vec); - ctx->block->instructions.emplace_back(std::move(load)); - emit_split_vector(ctx, vec, 4); + unreachable("store_global not implemented for this size."); + } - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), - emit_extract_vector(ctx, vec, 0, s1), - emit_extract_vector(ctx, vec, 1, s1), - emit_extract_vector(ctx, vec, 2, s1)); + aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)}; + flat->operands[0] = Operand(store_addr); + flat->operands[1] = Operand(s1); + flat->operands[2] = Operand(write_datas[i]); + flat->glc = glc; + flat->dlc = false; + flat->offset = offset; + flat->disable_wqm = true; + flat->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(flat)); } else { - ctx->block->instructions.emplace_back(std::move(load)); - } - } -} - -void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) -{ - Builder bld(ctx->program, ctx->block); - unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; - - Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); - Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); - - unsigned writemask = nir_intrinsic_write_mask(instr); - while (writemask) { - int start, count; - u_bit_scan_consecutive_range(&writemask, &start, &count); - unsigned num_bytes = count * elem_size_bytes; - - Temp write_data = data; - if (count != instr->num_components) { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; - for (int i = 0; i < count; i++) - vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1)); - write_data = bld.tmp(RegType::vgpr, count); - vec->definitions[0] = Definition(write_data); - ctx->block->instructions.emplace_back(std::move(vec)); - } - - unsigned offset = start * elem_size_bytes; - if (offset > 0 && ctx->options->chip_class < GFX9) { - Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1); - Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1); - Temp carry = bld.tmp(bld.lm); - bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); + assert(ctx->options->chip_class == GFX6); - bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), - Operand(offset), addr0); - bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), - Operand(0u), addr1, - carry).def(1).setHint(vcc); + aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); - addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1); - - offset = 0; - } + Temp rsrc = get_gfx6_global_rsrc(bld, addr); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); - bool global = ctx->options->chip_class >= GFX9; - aco_opcode op; - switch (num_bytes) { - case 4: - op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; - break; - case 8: - op = global ? 
aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; - break; - case 12: - op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3; - break; - case 16: - op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; - break; - default: - unreachable("store_global not implemented for this size."); + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, 0)}; + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); + mubuf->operands[2] = Operand(0u); + mubuf->operands[3] = Operand(write_datas[i]); + mubuf->glc = glc; + mubuf->dlc = false; + mubuf->offset = offsets[i]; + mubuf->addr64 = addr.type() == RegType::vgpr; + mubuf->disable_wqm = true; + mubuf->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); } - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)}; - flat->operands[0] = Operand(addr); - flat->operands[1] = Operand(s1); - flat->operands[2] = Operand(data); - flat->glc = glc; - flat->dlc = false; - flat->offset = offset; - flat->disable_wqm = true; - flat->barrier = barrier_buffer; - ctx->program->needs_exact = true; - ctx->block->instructions.emplace_back(std::move(flat)); } } @@ -4873,75 +6381,149 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } Builder bld(ctx->program, ctx->block); - Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); + if (ctx->options->chip_class >= GFX7) + addr = as_vgpr(ctx, addr); + if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap) data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), get_ssa_temp(ctx, instr->src[2].ssa), data); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bool global = ctx->options->chip_class >= GFX9; aco_opcode op32, op64; - switch (instr->intrinsic) { - case nir_intrinsic_global_atomic_add: - op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; - op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; - break; - case nir_intrinsic_global_atomic_imin: - op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; - op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; - break; - case nir_intrinsic_global_atomic_umin: - op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; - op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; - break; - case nir_intrinsic_global_atomic_imax: - op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; - op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; - break; - case nir_intrinsic_global_atomic_umax: - op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; - op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; - break; - case nir_intrinsic_global_atomic_and: - op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; - op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; - break; - case nir_intrinsic_global_atomic_or: - op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; - op64 = global ? 
aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; - break; - case nir_intrinsic_global_atomic_xor: - op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; - op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; - break; - case nir_intrinsic_global_atomic_exchange: - op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; - op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; - break; - case nir_intrinsic_global_atomic_comp_swap: - op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; - op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; - break; - default: - unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + + if (ctx->options->chip_class >= GFX7) { + bool global = ctx->options->chip_class >= GFX9; + switch (instr->intrinsic) { + case nir_intrinsic_global_atomic_add: + op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; + op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; + op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; + op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; + op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; + op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; + op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; + op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; + op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; + op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; + op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + } + + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 
1 : 0)}; + flat->operands[0] = Operand(addr); + flat->operands[1] = Operand(s1); + flat->operands[2] = Operand(data); + if (return_previous) + flat->definitions[0] = Definition(dst); + flat->glc = return_previous; + flat->dlc = false; /* Not needed for atomics */ + flat->offset = 0; + flat->disable_wqm = true; + flat->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(flat)); + } else { + assert(ctx->options->chip_class == GFX6); + + switch (instr->intrinsic) { + case nir_intrinsic_global_atomic_add: + op32 = aco_opcode::buffer_atomic_add; + op64 = aco_opcode::buffer_atomic_add_x2; + break; + case nir_intrinsic_global_atomic_imin: + op32 = aco_opcode::buffer_atomic_smin; + op64 = aco_opcode::buffer_atomic_smin_x2; + break; + case nir_intrinsic_global_atomic_umin: + op32 = aco_opcode::buffer_atomic_umin; + op64 = aco_opcode::buffer_atomic_umin_x2; + break; + case nir_intrinsic_global_atomic_imax: + op32 = aco_opcode::buffer_atomic_smax; + op64 = aco_opcode::buffer_atomic_smax_x2; + break; + case nir_intrinsic_global_atomic_umax: + op32 = aco_opcode::buffer_atomic_umax; + op64 = aco_opcode::buffer_atomic_umax_x2; + break; + case nir_intrinsic_global_atomic_and: + op32 = aco_opcode::buffer_atomic_and; + op64 = aco_opcode::buffer_atomic_and_x2; + break; + case nir_intrinsic_global_atomic_or: + op32 = aco_opcode::buffer_atomic_or; + op64 = aco_opcode::buffer_atomic_or_x2; + break; + case nir_intrinsic_global_atomic_xor: + op32 = aco_opcode::buffer_atomic_xor; + op64 = aco_opcode::buffer_atomic_xor_x2; + break; + case nir_intrinsic_global_atomic_exchange: + op32 = aco_opcode::buffer_atomic_swap; + op64 = aco_opcode::buffer_atomic_swap_x2; + break; + case nir_intrinsic_global_atomic_comp_swap: + op32 = aco_opcode::buffer_atomic_cmpswap; + op64 = aco_opcode::buffer_atomic_cmpswap_x2; + break; + default: + unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions."); + } + + Temp rsrc = get_gfx6_global_rsrc(bld, addr); + + aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; + + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); + mubuf->operands[2] = Operand(0u); + mubuf->operands[3] = Operand(data); + if (return_previous) + mubuf->definitions[0] = Definition(dst); + mubuf->glc = return_previous; + mubuf->dlc = false; + mubuf->offset = 0; + mubuf->addr64 = addr.type() == RegType::vgpr; + mubuf->disable_wqm = true; + mubuf->barrier = barrier_buffer; + ctx->program->needs_exact = true; + ctx->block->instructions.emplace_back(std::move(mubuf)); } - aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; - aco_ptr flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 
1 : 0)}; - flat->operands[0] = Operand(addr); - flat->operands[1] = Operand(s1); - flat->operands[2] = Operand(data); - if (return_previous) - flat->definitions[0] = Definition(dst); - flat->glc = return_previous; - flat->dlc = false; /* Not needed for atomics */ - flat->offset = 0; - flat->disable_wqm = true; - flat->barrier = barrier_buffer; - ctx->program->needs_exact = true; - ctx->block->instructions.emplace_back(std::move(flat)); } void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { @@ -4949,10 +6531,7 @@ void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { switch(instr->intrinsic) { case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: - bld.barrier(aco_opcode::p_memory_barrier_all); - break; - case nir_intrinsic_memory_barrier_atomic_counter: - bld.barrier(aco_opcode::p_memory_barrier_atomic); + bld.barrier(aco_opcode::p_memory_barrier_common); break; case nir_intrinsic_memory_barrier_buffer: bld.barrier(aco_opcode::p_memory_barrier_buffer); @@ -4960,6 +6539,7 @@ void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { case nir_intrinsic_memory_barrier_image: bld.barrier(aco_opcode::p_memory_barrier_image); break; + case nir_intrinsic_memory_barrier_tcs_patch: case nir_intrinsic_memory_barrier_shared: bld.barrier(aco_opcode::p_memory_barrier_shared); break; @@ -4973,7 +6553,6 @@ void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr) { // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read() Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared."); Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); Builder bld(ctx->program, ctx->block); @@ -4988,7 +6567,6 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) Temp data = get_ssa_temp(ctx, instr->src[0].ssa); Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; - assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported."); unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes; store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align); @@ -4997,7 +6575,8 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr) void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) { unsigned offset = nir_intrinsic_base(instr); - Operand m = load_lds_size_m0(ctx); + Builder bld(ctx->program, ctx->block); + Operand m = load_lds_size_m0(bld); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); @@ -5090,7 +6669,6 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } if (offset > 65535) { - Builder bld(ctx->program, ctx->block); address = bld.vadd32(bld.def(v1), Operand(offset), address); offset = 0; } @@ -5129,128 +6707,45 @@ Temp get_scratch_resource(isel_context *ctx) /* older generations need element size = 16 bytes. 
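[editor's sketch, not part of the patch] The shared-atomic path above keeps the DS immediate offset only while it fits in 16 bits; anything larger is folded into the address VGPR first. The rule in miniature:

#include <cassert>
#include <cstdint>

int main()
{
   uint32_t address = 0x100, offset = 0x12340; /* too large for the 16-bit field */
   if (offset > 65535) {
      address += offset; /* vadd32 in the patch */
      offset = 0;
   }
   assert(offset <= 65535 && address == 0x12440);
   return 0;
}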
element size removed in GFX9 */ if (ctx->program->chip_class <= GFX8) - rsrc_conf |= S_008F0C_ELEMENT_SIZE(3); - - return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); -} - -void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { - assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64); - Builder bld(ctx->program, ctx->block); - Temp rsrc = get_scratch_resource(ctx); - Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); - Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - - aco_opcode op; - switch (dst.size()) { - case 1: - op = aco_opcode::buffer_load_dword; - break; - case 2: - op = aco_opcode::buffer_load_dwordx2; - break; - case 3: - op = aco_opcode::buffer_load_dwordx3; - break; - case 4: - op = aco_opcode::buffer_load_dwordx4; - break; - case 6: - case 8: { - std::array elems; - Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4, - bld.def(v4), offset, rsrc, - ctx->program->scratch_offset, 0, true); - Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 : - aco_opcode::buffer_load_dwordx4, - dst.size() == 6 ? bld.def(v2) : bld.def(v4), - offset, rsrc, ctx->program->scratch_offset, 16, true); - emit_split_vector(ctx, lower, 2); - elems[0] = emit_extract_vector(ctx, lower, 0, v2); - elems[1] = emit_extract_vector(ctx, lower, 1, v2); - if (dst.size() == 8) { - emit_split_vector(ctx, upper, 2); - elems[2] = emit_extract_vector(ctx, upper, 0, v2); - elems[3] = emit_extract_vector(ctx, upper, 1, v2); - } else { - elems[2] = upper; - } - - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, - Format::PSEUDO, dst.size() / 2, 1)}; - for (unsigned i = 0; i < dst.size() / 2; i++) - vec->operands[i] = Operand(elems[i]); - vec->definitions[0] = Definition(dst); - bld.insert(std::move(vec)); - ctx->allocated_vec.emplace(dst.id(), elems); - return; - } - default: - unreachable("Wrong dst size for nir_intrinsic_load_scratch"); - } + rsrc_conf |= S_008F0C_ELEMENT_SIZE(3); + + return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); +} + +void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { + Builder bld(ctx->program, ctx->block); + Temp rsrc = get_scratch_resource(ctx); + Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true); - emit_split_vector(ctx, dst, instr->num_components); + LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components, + instr->dest.ssa.bit_size / 8u, rsrc}; + info.align_mul = nir_intrinsic_align_mul(instr); + info.align_offset = nir_intrinsic_align_offset(instr); + info.swizzle_component_size = 16; + info.can_reorder = false; + info.soffset = ctx->program->scratch_offset; + emit_mubuf_load(ctx, bld, &info); } void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { - assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64); Builder bld(ctx->program, ctx->block); Temp rsrc = get_scratch_resource(ctx); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; - unsigned writemask = nir_intrinsic_write_mask(instr); - - while (writemask) { - int start, count; - u_bit_scan_consecutive_range(&writemask, &start, &count); - int num_bytes = count * 
elem_size_bytes; - - if (num_bytes > 16) { - assert(elem_size_bytes == 8); - writemask |= (((count - 2) << 1) - 1) << (start + 2); - count = 2; - num_bytes = 16; - } - - // TODO: check alignment of sub-dword stores - // TODO: split 3 bytes. there is no store instruction for that - - Temp write_data; - if (count != instr->num_components) { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; - for (int i = 0; i < count; i++) { - Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4)); - vec->operands[i] = Operand(elem); - } - write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4)); - vec->definitions[0] = Definition(write_data); - ctx->block->instructions.emplace_back(std::move(vec)); - } else { - write_data = data; - } + unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); - aco_opcode op; - switch (num_bytes) { - case 4: - op = aco_opcode::buffer_store_dword; - break; - case 8: - op = aco_opcode::buffer_store_dwordx2; - break; - case 12: - op = aco_opcode::buffer_store_dwordx3; - break; - case 16: - op = aco_opcode::buffer_store_dwordx4; - break; - default: - unreachable("Invalid data size for nir_intrinsic_store_scratch."); - } + unsigned write_count = 0; + Temp write_datas[32]; + unsigned offsets[32]; + split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, + 16, &write_count, write_datas, offsets); - bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true); + for (unsigned i = 0; i < write_count; i++) { + aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); + bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true); } } @@ -5284,6 +6779,102 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) { bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage)); } +void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) { + Builder bld(ctx->program, ctx->block); + + unsigned stream = nir_intrinsic_stream_id(instr); + Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u); + nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]); + + /* get GSVS ring */ + Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u)); + + unsigned num_components = + ctx->program->info->gs.num_stream_output_components[stream]; + assert(num_components); + + unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out; + unsigned stream_offset = 0; + for (unsigned i = 0; i < stream; i++) { + unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out; + stream_offset += prev_stride * ctx->program->wave_size; + } + + /* Limit on the stride field for <= GFX7. 
*/ + assert(stride < (1 << 14)); + + Temp gsvs_dwords[4]; + for (unsigned i = 0; i < 4; i++) + gsvs_dwords[i] = bld.tmp(s1); + bld.pseudo(aco_opcode::p_split_vector, + Definition(gsvs_dwords[0]), + Definition(gsvs_dwords[1]), + Definition(gsvs_dwords[2]), + Definition(gsvs_dwords[3]), + gsvs_ring); + + if (stream_offset) { + Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset)); + + Temp carry = bld.tmp(s1); + gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp); + gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry)); + } + + gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride))); + gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size)); + + gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), + gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]); + + unsigned offset = 0; + for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) { + if (ctx->program->info->gs.output_streams[i] != stream) + continue; + + for (unsigned j = 0; j < 4; j++) { + if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j))) + continue; + + if (ctx->outputs.mask[i] & (1 << j)) { + Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex); + unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u; + if (const_offset >= 4096u) { + if (vaddr_offset.isUndefined()) + vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u)); + else + vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset); + const_offset %= 4096u; + } + + aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; + mtbuf->operands[0] = Operand(gsvs_ring); + mtbuf->operands[1] = vaddr_offset; + mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset)); + mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]); + mtbuf->offen = !vaddr_offset.isUndefined(); + mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32; + mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; + mtbuf->offset = const_offset; + mtbuf->glc = true; + mtbuf->slc = true; + mtbuf->barrier = barrier_gs_data; + mtbuf->can_reorder = true; + bld.insert(std::move(mtbuf)); + } + + offset += ctx->shader->info.gs.vertices_out; + } + + /* outputs for the next vertex are undefined and keeping them around can + * create invalid IR with control flow */ + ctx->outputs.mask[i] = 0; + } + + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream)); +} + Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src) { Builder bld(ctx->program, ctx->block); @@ -5302,7 +6893,8 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) { //subgroupAnd(val) -> (exec & ~val) == 0 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); - return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp)); + Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp)); + return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond); } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) { //subgroupOr(val) -> (val & exec) != 0 Temp tmp 
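[editor's sketch, not part of the patch] The GSVS ring arithmetic used in visit_emit_vertex_with_counter() above, with made-up output counts: the per-lane stride covers all of one stream's components for every emitted vertex, and each stream's window starts after the previous streams' windows scaled by the wave size, which is why the stride must still fit the 14-bit descriptor field.

#include <cassert>
#include <cstdint>

int main()
{
   const unsigned wave_size = 64;
   const unsigned vertices_out = 4;                        /* shader->info.gs.vertices_out */
   const unsigned components_per_stream[4] = {6, 2, 0, 0}; /* num_stream_output_components */
   const unsigned stream = 1;

   unsigned stride = 4u * components_per_stream[stream] * vertices_out;
   unsigned stream_offset = 0;
   for (unsigned i = 0; i < stream; i++)
      stream_offset += 4u * components_per_stream[i] * vertices_out * wave_size;

   assert(stride == 32 && stride < (1u << 14)); /* the assert in the patch */
   assert(stream_offset == 6144);               /* 4 * 6 * 4 * 64 */
   return 0;
}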
= bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp(); @@ -5510,6 +7102,18 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, dst, 2); break; } + case nir_intrinsic_load_barycentric_model: { + Temp model = get_arg(ctx, ctx->args->ac.pull_model); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Temp p1 = emit_extract_vector(ctx, model, 0, v1); + Temp p2 = emit_extract_vector(ctx, model, 1, v1); + Temp p3 = emit_extract_vector(ctx, model, 2, v1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), + Operand(p1), Operand(p2), Operand(p3)); + emit_split_vector(ctx, dst, 3); + break; + } case nir_intrinsic_load_barycentric_at_sample: { uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16; switch (ctx->options->key.fs.num_samples) { @@ -5533,12 +7137,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u)); offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset)); } - sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(offset)); + + Operand off = bld.copy(bld.def(s1), Operand(offset)); + sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); } else if (ctx->options->chip_class >= GFX9) { addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset); - } else { + } else if (ctx->options->chip_class >= GFX7) { /* addr += private_segment_buffer + sample_pos_offset */ Temp tmp0 = bld.tmp(s1); Temp tmp1 = bld.tmp(s1); @@ -5555,6 +7161,32 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) /* sample_pos = flat_load_dwordx2 addr */ sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1)); + } else { + assert(ctx->options->chip_class == GFX6); + + uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf)); + + addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); + addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u)); + + sample_pos = bld.tmp(v2); + + aco_ptr load{create_instruction(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; + load->definitions[0] = Definition(sample_pos); + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(addr); + load->operands[2] = Operand(0u); + load->offset = sample_pos_offset; + load->offen = 0; + load->addr64 = true; + load->glc = false; + load->dlc = false; + load->disable_wqm = false; + load->barrier = barrier_none; + load->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(load)); } /* sample_pos -= 0.5 */ @@ -5580,14 +7212,16 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc); break; } - case nir_intrinsic_load_view_index: - case nir_intrinsic_load_layer_id: { - if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) { + case nir_intrinsic_load_view_index: { + if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) { Temp dst = get_ssa_temp(ctx, 
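[editor's sketch, not part of the patch] emit_boolean_reduce() above (and the vote_all/vote_any rework later in this hunk) relies on two exec-mask identities: subgroupAnd(v) is ((exec & ~v) == 0) and subgroupOr(v) is ((exec & v) != 0). A brute-force check on a toy 8-lane mask:

#include <cassert>
#include <cstdint>

int main()
{
   for (uint32_t exec = 0; exec < 256; exec++) {
      for (uint32_t val = 0; val < 256; val++) {
         bool all_active_true = (exec & ~val & 0xffu) == 0; /* s_andn2 result == 0 */
         bool any_active_true = (exec & val) != 0;          /* s_and result != 0 */

         bool ref_all = true, ref_any = false;              /* reference over active lanes */
         for (unsigned lane = 0; lane < 8; lane++) {
            if (!(exec & (1u << lane)))
               continue;
            bool b = val & (1u << lane);
            ref_all &= b;
            ref_any |= b;
         }
         assert(all_active_true == ref_all);
         assert(any_active_true == ref_any);
      }
   }
   return 0;
}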
&instr->dest.ssa); bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); break; } + /* fallthrough */ + } + case nir_intrinsic_load_layer_id: { unsigned idx = nir_intrinsic_base(instr); bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0); @@ -5605,6 +7239,9 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u)); break; } + case nir_intrinsic_load_tess_coord: + visit_load_tess_coord(ctx, instr); + break; case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break; @@ -5612,8 +7249,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) visit_store_output(ctx, instr); break; case nir_intrinsic_load_input: + case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break; + case nir_intrinsic_load_output: + visit_load_output(ctx, instr); + break; + case nir_intrinsic_load_per_vertex_input: + visit_load_per_vertex_input(ctx, instr); + break; + case nir_intrinsic_load_per_vertex_output: + visit_load_per_vertex_output(ctx, instr); + break; + case nir_intrinsic_store_per_vertex_output: + visit_store_per_vertex_output(ctx, instr); + break; case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break; @@ -5716,16 +7366,23 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_get_buffer_size: visit_get_buffer_size(ctx, instr); break; - case nir_intrinsic_barrier: { - unsigned* bsize = ctx->program->info->cs.block_size; - unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2]; - if (workgroup_size > ctx->program->wave_size) + case nir_intrinsic_control_barrier: { + if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { + /* GFX6 only (thanks to a hw bug workaround): + * The real barrier instruction isn’t needed, because an entire patch + * always fits into a single wave. 
+ */ + break; + } + + if (ctx->program->workgroup_size > ctx->program->wave_size) bld.sopp(aco_opcode::s_barrier); + break; } + case nir_intrinsic_memory_barrier_tcs_patch: case nir_intrinsic_group_memory_barrier: case nir_intrinsic_memory_barrier: - case nir_intrinsic_memory_barrier_atomic_counter: case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier_shared: @@ -5906,8 +7563,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(dst.regClass() == bld.lm); Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); - Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp)); - emit_wqm(ctx, val, dst); + Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp)); + bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond); break; } case nir_intrinsic_vote_any: { @@ -5916,9 +7573,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(src.regClass() == bld.lm); assert(dst.regClass() == bld.lm); - Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); - Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), Operand(0u), bld.scc(tmp)); - emit_wqm(ctx, val, dst); + Temp tmp = bool_to_scalar_condition(ctx, src); + bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); break; } case nir_intrinsic_reduce: @@ -6196,7 +7852,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_demote: - bld.pseudo(aco_opcode::p_demote_to_helper); + bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u)); + + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty_discard = true; ctx->block->kind |= block_kind_uses_demote; ctx->program->needs_exact = true; break; @@ -6205,6 +7864,9 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) assert(src.regClass() == bld.lm); Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_demote_to_helper, cond); + + if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent) + ctx->cf_info.exec_potentially_empty_discard = true; ctx->block->kind |= block_kind_uses_demote; ctx->program->needs_exact = true; break; @@ -6243,6 +7905,63 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id)); break; } + case nir_intrinsic_load_invocation_id: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { + if (ctx->options->chip_class >= GFX10) + bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id)); + else + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id)); + } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { + bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), + get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u)); + } else { + unreachable("Unsupported stage for load_invocation_id"); + } + + break; + } + case nir_intrinsic_load_primitive_id: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + switch (ctx->shader->info.stage) { + case MESA_SHADER_GEOMETRY: + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); + break; + case 
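[editor's sketch, not part of the patch] The v_bfe_u32 used for the TCS invocation ID above is a plain bit-field extract: width 5 starting at bit 8 of tcs_rel_ids (field position as in the patch; the example value below is made up).

#include <cassert>
#include <cstdint>

static uint32_t bfe_u32(uint32_t src, unsigned offset, unsigned width)
{
   return (src >> offset) & ((1u << width) - 1u); /* assumes 0 < width < 32 */
}

int main()
{
   uint32_t tcs_rel_ids = (7u << 8) | 0x5au; /* invocation 7 in bits [8..12] */
   assert(bfe_u32(tcs_rel_ids, 8, 5) == 7u);
   return 0;
}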
MESA_SHADER_TESS_CTRL: + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id)); + break; + case MESA_SHADER_TESS_EVAL: + bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id)); + break; + default: + unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id"); + } + + break; + } + case nir_intrinsic_load_patch_vertices_in: { + assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL || + ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); + + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices)); + break; + } + case nir_intrinsic_emit_vertex_with_counter: { + visit_emit_vertex_with_counter(ctx, instr); + break; + } + case nir_intrinsic_end_primitive_with_counter: { + unsigned stream = nir_intrinsic_stream_id(instr); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream)); + break; + } + case nir_intrinsic_set_vertex_count: { + /* unused, the HW keeps track of this for us */ + break; + } default: fprintf(stderr, "Unimplemented intrinsic instr: "); nir_print_instr(&instr->instr, stderr); @@ -6290,6 +8009,8 @@ void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr, *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false); } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false); + } else if (instr->op == nir_texop_fragment_mask_fetch) { + *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false); } else { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false); } @@ -6342,7 +8063,7 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma); Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id); - Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id); + Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id); is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z); Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y); @@ -6366,22 +8087,20 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp); } -void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array) +void prepare_cube_coords(isel_context *ctx, std::vector& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array) { Builder bld(ctx->program, ctx->block); - Temp coord_args[4], ma, tc, sc, id; - for (unsigned i = 0; i < (is_array ? 
4 : 3); i++) - coord_args[i] = emit_extract_vector(ctx, *coords, i, v1); + Temp ma, tc, sc, id; if (is_array) { - coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]); + coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]); // see comment in ac_prepare_cube_coords() if (ctx->options->chip_class <= GFX8) - coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]); + coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]); } - ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]); aco_ptr vop3a{create_instruction(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; vop3a->operands[0] = Operand(ma); @@ -6390,15 +8109,15 @@ void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, vop3a->definitions[0] = Definition(invma); ctx->block->instructions.emplace_back(std::move(vop3a)); - sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]); if (!is_deriv) sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/)); - tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]); if (!is_deriv) tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/)); - id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]); + id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]); if (is_deriv) { sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma); @@ -6427,27 +8146,11 @@ void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, } if (is_array) - id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/)); - *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id); - -} - -Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx) -{ - Temp coord_vec[3]; - for (unsigned i = 0; i < coords.size(); i++) - coord_vec[i] = emit_extract_vector(ctx, coords, i, v1); - - Builder bld(ctx->program, ctx->block); - coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]); - - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; - for (unsigned i = 0; i < coords.size(); i++) - vec->operands[i] = Operand(coord_vec[i]); - Temp res = bld.tmp(RegType::vgpr, coords.size()); - vec->definitions[0] = Definition(res); - ctx->block->instructions.emplace_back(std::move(vec)); - return res; + id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/)); + coords.resize(3); + coords[0] = sc; + coords[1] = tc; + coords[2] = id; } void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4]) @@ -6469,8 +8172,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Builder bld(ctx->program, ctx->block); bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false; - Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(), - lod = Temp(), offset = 
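[editor's sketch, not part of the patch] The arithmetic prepare_cube_coords() performs after the v_cube* instructions: the selected face coordinates are scaled by 1/|ma| and biased by 1.5 (the v_madak literal), and for cube arrays the rounded layer is folded into the face id as layer * 8 + face (the v_madmk literal). The input values below are made up; the sign conventions of the cube instructions themselves are not modelled here.

#include <cassert>
#include <cmath>

int main()
{
   /* pretend v_cubesc / v_cubetc / v_cubema / v_cubeid produced these */
   float sc = 0.25f, tc = -0.5f, ma = 2.0f /* 2 * major axis */, face = 3.0f;
   float layer = 4.3f;

   float invma = 1.0f / std::fabs(ma);             /* v_rcp_f32 with |abs| modifier */
   float s = sc * invma + 1.5f;                    /* v_madak_f32 ..., 1.5 */
   float t = tc * invma + 1.5f;
   float id = std::nearbyint(layer) * 8.0f + face; /* v_rndne then v_madmk_f32 ..., 8.0 */

   assert(s == 1.625f && t == 1.25f);
   assert(id == 35.0f); /* layer 4, face 3 */
   return 0;
}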
Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp(); + Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(); + std::vector coords; + std::vector derivs; nir_const_value *sample_index_cv = NULL; nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL}; enum glsl_base_type stype; @@ -6483,9 +8188,12 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) for (unsigned i = 0; i < instr->num_srcs; i++) { switch (instr->src[i].src_type) { - case nir_tex_src_coord: - coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa)); + case nir_tex_src_coord: { + Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa); + for (unsigned i = 0; i < coord.size(); i++) + coords.emplace_back(emit_extract_vector(ctx, coord, i, v1)); break; + } case nir_tex_src_bias: if (instr->op == nir_texop_txb) { bias = get_ssa_temp(ctx, instr->src[i].src.ssa); @@ -6533,7 +8241,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) break; } } -// TODO: all other cases: structure taken from ac_nir_to_llvm.c + if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true); @@ -6543,10 +8251,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16)); Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */)); - Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u)); + Operand default_sample = Operand(1u); + if (ctx->options->robust_buffer_access) { + /* Extract the second dword of the descriptor, if it's + * all zero, then it's a null descriptor. 
+ */ + Temp dword1 = emit_extract_vector(ctx, resource, 1, s1); + Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u)); + default_sample = Operand(is_non_null_descriptor); + } + + Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u)); bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - samples, Operand(1u), bld.scc(is_msaa)); + samples, default_sample, bld.scc(is_msaa)); return; } @@ -6613,15 +8331,19 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components) - prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod); + prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod); /* pack derivatives */ if (has_ddx || has_ddy) { if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) { - derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), - ddx, Operand(0u), ddy, Operand(0u)); + assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1); + Temp zero = bld.copy(bld.def(v1), Operand(0u)); + derivs = {ddx, zero, ddy, zero}; } else { - derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy); + for (unsigned i = 0; has_ddx && i < ddx.size(); i++) + derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1)); + for (unsigned i = 0; has_ddy && i < ddy.size(); i++) + derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1)); } has_derivs = true; } @@ -6630,7 +8352,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array && instr->op != nir_texop_txf) - coords = apply_round_slice(ctx, coords, 1); + coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]); if (instr->coord_components > 2 && (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || @@ -6638,22 +8360,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && instr->is_array && - instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) - coords = apply_round_slice(ctx, coords, 2); + instr->op != nir_texop_txf && + instr->op != nir_texop_txf_ms && + instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) + coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]); if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->op != nir_texop_lod && instr->coord_components) { assert(coords.size() > 0 && coords.size() < 3); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)}; - vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1)); - vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000); - if (coords.size() > 1) - vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1)); - coords = bld.tmp(RegType::vgpr, coords.size() + 1); - vec->definitions[0] = Definition(coords); - ctx->block->instructions.emplace_back(std::move(vec)); + coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ? 
+ Operand((uint32_t) 0) : + Operand((uint32_t) 0x3f000000))); } bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); @@ -6663,7 +8383,9 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->op != nir_texop_txs) { + instr->op != nir_texop_txs && + instr->op != nir_texop_fragment_fetch && + instr->op != nir_texop_fragment_mask_fetch) { assert(has_sample_index); Operand op(sample_index); if (sample_index_cv) @@ -6672,24 +8394,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { - Temp split_coords[coords.size()]; - emit_split_vector(ctx, coords, coords.size()); - for (unsigned i = 0; i < coords.size(); i++) - split_coords[i] = emit_extract_vector(ctx, coords, i, v1); - - unsigned i = 0; - for (; i < std::min(offset.size(), instr->coord_components); i++) { + for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) { Temp off = emit_extract_vector(ctx, offset, i, v1); - split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off); + coords[i] = bld.vadd32(bld.def(v1), coords[i], off); } - - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; - for (unsigned i = 0; i < coords.size(); i++) - vec->operands[i] = Operand(split_coords[i]); - coords = bld.tmp(coords.regClass()); - vec->definitions[0] = Definition(coords); - ctx->block->instructions.emplace_back(std::move(vec)); - has_offset = false; } @@ -6728,9 +8436,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (tmp_dst.id() == dst.id() && div_by_6) tmp_dst = bld.tmp(tmp_dst.regClass()); - tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)); - tex->operands[0] = Operand(as_vgpr(ctx,lod)); - tex->operands[1] = Operand(resource); + tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = Operand(as_vgpr(ctx,lod)); if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && @@ -6768,9 +8477,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Temp tg4_compare_cube_wa64 = Temp(); if (tg4_integer_workarounds) { - tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)); - tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); - tex->operands[1] = Operand(resource); + tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); tex->dim = dim; tex->dmask = 0x3; tex->da = da; @@ -6788,12 +8498,9 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]); } - Temp orig_coords[2] = { - emit_extract_vector(ctx, coords, 0, v1), - emit_extract_vector(ctx, coords, 1, v1)}; Temp new_coords[2] = { - bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]), - bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1]) + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), + bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], 
half_texel[1]) }; if (tg4_integer_cube_workaround) { @@ -6842,63 +8549,14 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) ctx->block->instructions.emplace_back(std::move(vec)); new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - new_coords[0], orig_coords[0], tg4_compare_cube_wa64); + new_coords[0], coords[0], tg4_compare_cube_wa64); new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), - new_coords[1], orig_coords[1], tg4_compare_cube_wa64); + new_coords[1], coords[1], tg4_compare_cube_wa64); } - - if (coords.size() == 3) { - coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), - new_coords[0], new_coords[1], - emit_extract_vector(ctx, coords, 2, v1)); - } else { - assert(coords.size() == 2); - coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), - new_coords[0], new_coords[1]); - } - } - - std::vector args; - if (has_offset) - args.emplace_back(Operand(offset)); - if (has_bias) - args.emplace_back(Operand(bias)); - if (has_compare) - args.emplace_back(Operand(compare)); - if (has_derivs) - args.emplace_back(Operand(derivs)); - args.emplace_back(Operand(coords)); - if (has_sample_index) - args.emplace_back(Operand(sample_index)); - if (has_lod) - args.emplace_back(lod); - - Temp arg; - if (args.size() > 1) { - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)}; - unsigned size = 0; - for (unsigned i = 0; i < args.size(); i++) { - size += args[i].size(); - vec->operands[i] = args[i]; - } - RegClass rc = RegClass(RegType::vgpr, size); - Temp tmp = bld.tmp(rc); - vec->definitions[0] = Definition(tmp); - ctx->block->instructions.emplace_back(std::move(vec)); - arg = tmp; - } else { - assert(args[0].isTemp()); - arg = as_vgpr(ctx, args[0].getTemp()); + coords[0] = new_coords[0]; + coords[1] = new_coords[1]; } - /* we don't need the bias, sample index, compare value or offset to be - * computed in WQM but if the p_create_vector copies the coordinates, then it - * needs to be in WQM */ - if (!(has_ddx && has_ddy) && !has_lod && !level_zero && - instr->sampler_dim != GLSL_SAMPLER_DIM_MS && - instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS) - arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true); - if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe() @@ -6925,8 +8583,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tmp_dst = bld.tmp(RegType::vgpr, last_bit); aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; - mubuf->operands[0] = Operand(coords); - mubuf->operands[1] = Operand(resource); + mubuf->operands[0] = Operand(resource); + mubuf->operands[1] = Operand(coords[0]); mubuf->operands[2] = Operand((uint32_t) 0); mubuf->definitions[0] = Definition(tmp_dst); mubuf->idxen = true; @@ -6937,14 +8595,41 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) return; } + /* gather MIMG address components */ + std::vector args; + if (has_offset) + args.emplace_back(offset); + if (has_bias) + args.emplace_back(bias); + if (has_compare) + args.emplace_back(compare); + if (has_derivs) + args.insert(args.end(), derivs.begin(), derivs.end()); + + args.insert(args.end(), coords.begin(), coords.end()); + if (has_sample_index) + args.emplace_back(sample_index); + if (has_lod) + args.emplace_back(lod); + + Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size())); + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)}; + vec->definitions[0] = 
Definition(arg); + for (unsigned i = 0; i < args.size(); i++) + vec->operands[i] = Operand(args[i]); + ctx->block->instructions.emplace_back(std::move(vec)); + if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms || - instr->op == nir_texop_samples_identical) { - aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip; - tex.reset(create_instruction(op, Format::MIMG, 2, 1)); - tex->operands[0] = Operand(arg); - tex->operands[1] = Operand(resource); + instr->op == nir_texop_samples_identical || + instr->op == nir_texop_fragment_fetch || + instr->op == nir_texop_fragment_mask_fetch) { + aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip; + tex.reset(create_instruction(op, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = Operand(arg); tex->dim = dim; tex->dmask = dmask; tex->unrm = true; @@ -7029,10 +8714,19 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) opcode = aco_opcode::image_get_lod; } + /* we don't need the bias, sample index, compare value or offset to be + * computed in WQM but if the p_create_vector copies the coordinates, then it + * needs to be in WQM */ + if (ctx->stage == fragment_fs && + !has_derivs && !has_lod && !level_zero && + instr->sampler_dim != GLSL_SAMPLER_DIM_MS && + instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS) + arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true); + tex.reset(create_instruction(opcode, Format::MIMG, 3, 1)); - tex->operands[0] = Operand(arg); - tex->operands[1] = Operand(resource); - tex->operands[2] = Operand(sampler); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(sampler); + tex->operands[2] = Operand(arg); tex->dim = dim; tex->dmask = dmask; tex->da = da; @@ -7091,7 +8785,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) std::vector& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds; unsigned num_operands = 0; - Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size())]; + Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1]; unsigned num_defined = 0; unsigned cur_pred_idx = 0; for (std::pair src : phi_src) { @@ -7109,6 +8803,10 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) continue; } } + /* Handle missing predecessors at the end. This shouldn't happen with loop + * headers and we can't ignore these sources for loop header phis. */ + if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size()) + continue; cur_pred_idx++; Operand op = get_phi_operand(ctx, src.second); operands[num_operands++] = op; @@ -7118,6 +8816,17 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) while (cur_pred_idx++ < preds.size()) operands[num_operands++] = Operand(dst.regClass()); + /* If the loop ends with a break, still add a linear continue edge in case + * that break is divergent or continue_or_break is used. We'll either remove + * this operand later in visit_loop() if it's not necessary or replace the + * undef with something correct. 
*/ + if (!logical && ctx->block->kind & block_kind_loop_header) { + nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent); + nir_block *last = nir_loop_last_block(loop); + if (last->successors[0] != instr->instr.block) + operands[num_operands++] = Operand(RegClass()); + } + if (num_defined == 0) { Builder bld(ctx->program, ctx->block); if (dst.regClass() == s1) { @@ -7272,6 +8981,11 @@ void visit_jump(isel_context *ctx, nir_jump_instr *instr) abort(); } + if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) { + ctx->cf_info.exec_potentially_empty_break = true; + ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth; + } + /* remove critical edges from linear CFG */ bld.branch(aco_opcode::p_branch); Block* break_block = ctx->program->create_and_insert_block(); @@ -7334,8 +9048,54 @@ void visit_block(isel_context *ctx, nir_block *block) +static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last, + aco_ptr& header_phi, Operand *vals) +{ + vals[0] = Operand(header_phi->definitions[0].getTemp()); + RegClass rc = vals[0].regClass(); + + unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth; + + unsigned next_pred = 1; + + for (unsigned idx = first + 1; idx <= last; idx++) { + Block& block = ctx->program->blocks[idx]; + if (block.loop_nest_depth != loop_nest_depth) { + vals[idx - first] = vals[idx - 1 - first]; + continue; + } + + if (block.kind & block_kind_continue) { + vals[idx - first] = header_phi->operands[next_pred]; + next_pred++; + continue; + } + + bool all_same = true; + for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++) + all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first]; + + Operand val; + if (all_same) { + val = vals[block.linear_preds[0] - first]; + } else { + aco_ptr phi(create_instruction( + aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1)); + for (unsigned i = 0; i < block.linear_preds.size(); i++) + phi->operands[i] = vals[block.linear_preds[i] - first]; + val = Operand(Temp(ctx->program->allocateId(), rc)); + phi->definitions[0] = Definition(val.getTemp()); + block.instructions.emplace(block.instructions.begin(), std::move(phi)); + } + vals[idx - first] = val; + } + + return vals[last - first]; +} + static void visit_loop(isel_context *ctx, nir_loop *loop) { + //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true append_logical_end(ctx->block); ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; Builder bld(ctx->program, ctx->block); @@ -7356,12 +9116,12 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) unsigned loop_header_idx = loop_header->index; loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit); append_logical_start(ctx->block); - visit_cf_list(ctx, &loop->body); + bool unreachable = visit_cf_list(ctx, &loop->body); //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken? if (!ctx->cf_info.has_branch) { append_logical_end(ctx->block); - if (ctx->cf_info.exec_potentially_empty) { + if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) { /* Discards can result in code running with an empty exec mask. * This would result in divergent breaks not ever being taken. 
As a * workaround, break the loop when the loop mask is empty instead of @@ -7386,7 +9146,8 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) add_linear_edge(block_idx, continue_block); add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]); - add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]); ctx->block = &ctx->program->blocks[block_idx]; } else { ctx->block->kind |= (block_kind_continue | block_kind_uniform); @@ -7400,8 +9161,11 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) bld.branch(aco_opcode::p_branch); } - /* fixup phis in loop header from unreachable blocks */ - if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) { + /* Fixup phis in loop header from unreachable blocks. + * has_branch/has_divergent_branch also indicates if the loop ends with a + * break/continue instruction, but we don't emit those if unreachable=true */ + if (unreachable) { + assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch); bool linear = ctx->cf_info.has_branch; bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch; for (aco_ptr& instr : ctx->program->blocks[loop_header_idx].instructions) { @@ -7415,6 +9179,24 @@ static void visit_loop(isel_context *ctx, nir_loop *loop) } } + /* Fixup linear phis in loop header from expecting a continue. Both this fixup + * and the previous one shouldn't both happen at once because a break in the + * merge block would get CSE'd */ + if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) { + unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1); + Operand vals[num_vals]; + for (aco_ptr& instr : ctx->program->blocks[loop_header_idx].instructions) { + if (instr->opcode == aco_opcode::p_linear_phi) { + if (ctx->cf_info.has_branch) + instr->operands.pop_back(); + else + instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals); + } else if (!is_phi(instr)) { + break; + } + } + } + ctx->cf_info.has_branch = false; // TODO: if the loop has not a single exit, we must add one °° @@ -7468,10 +9250,16 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth; ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level)); - ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty; + ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard; + ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break; + ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth; ic->divergent_old = ctx->cf_info.parent_if.is_divergent; ctx->cf_info.parent_if.is_divergent = true; - ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */ + + /* divergent branches use cbranch_execz */ + ctx->cf_info.exec_potentially_empty_discard = false; + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; /** emit logical then block */ Block* BB_then_logical = ctx->program->create_and_insert_block(); @@ -7516,8 +9304,14 @@ static void begin_divergent_if_else(isel_context *ctx, if_context *ic) branch->operands[0] = Operand(ic->cond); 
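/* Illustration (not part of the patch): the divergent-if hunks in this change replace the
 * single exec_potentially_empty flag with separate discard/break flags plus the shallowest
 * loop depth at which a divergent break may have emptied exec. A minimal standalone sketch
 * of how the saved state from the then-leg is merged back after the endif; CfState and
 * IfSave are hypothetical stand-ins for the real isel_context/if_context fields:
 */
#include <algorithm>
#include <cstdint>

struct CfState {
   bool empty_discard = false;            /* a discard may have emptied exec */
   bool empty_break = false;              /* a divergent break may have emptied exec */
   uint16_t empty_break_depth = UINT16_MAX;
   unsigned loop_nest_depth = 0;
   bool in_divergent_if = false;
};

struct IfSave {
   bool discard_old;
   bool break_old;
   uint16_t break_depth_old;
};

static void merge_divergent_if(CfState &cf, const IfSave &saved)
{
   /* anything that could empty exec in either leg still can after the endif */
   cf.empty_discard |= saved.discard_old;
   cf.empty_break |= saved.break_old;
   cf.empty_break_depth = std::min(saved.break_depth_old, cf.empty_break_depth);

   /* once control flow is uniform again at the loop level that owns the break,
    * the break can no longer leave exec empty */
   if (cf.loop_nest_depth == cf.empty_break_depth && !cf.in_divergent_if) {
      cf.empty_break = false;
      cf.empty_break_depth = UINT16_MAX;
   }
}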
ctx->block->instructions.push_back(std::move(branch)); - ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty; - ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */ + ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; + ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break; + ic->exec_potentially_empty_break_depth_old = + std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + /* divergent branches use cbranch_execz */ + ctx->cf_info.exec_potentially_empty_discard = false; + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; /** emit logical else block */ Block* BB_else_logical = ctx->program->create_and_insert_block(); @@ -7564,17 +9358,115 @@ static void end_divergent_if(isel_context *ctx, if_context *ic) ctx->cf_info.parent_if.is_divergent = ic->divergent_old; - ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old; + ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old; + ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old; + ctx->cf_info.exec_potentially_empty_break_depth = + std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); + if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth && + !ctx->cf_info.parent_if.is_divergent) { + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; + } /* uniform control flow never has an empty exec-mask */ - if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) - ctx->cf_info.exec_potentially_empty = false; + if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) { + ctx->cf_info.exec_potentially_empty_discard = false; + ctx->cf_info.exec_potentially_empty_break = false; + ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; + } +} + +static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond) +{ + assert(cond.regClass() == s1); + + append_logical_end(ctx->block); + ctx->block->kind |= block_kind_uniform; + + aco_ptr branch; + aco_opcode branch_opcode = aco_opcode::p_cbranch_z; + branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 0)); + branch->operands[0] = Operand(cond); + branch->operands[0].setFixed(scc); + ctx->block->instructions.emplace_back(std::move(branch)); + + ic->BB_if_idx = ctx->block->index; + ic->BB_endif = Block(); + ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth; + ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level; + + ctx->cf_info.has_branch = false; + ctx->cf_info.parent_loop.has_divergent_branch = false; + + /** emit then block */ + Block* BB_then = ctx->program->create_and_insert_block(); + BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(ic->BB_if_idx, BB_then); + append_logical_start(BB_then); + ctx->block = BB_then; +} + +static void begin_uniform_if_else(isel_context *ctx, if_context *ic) +{ + Block *BB_then = ctx->block; + + ic->uniform_has_then_branch = ctx->cf_info.has_branch; + ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; + + if (!ic->uniform_has_then_branch) { + append_logical_end(BB_then); + /* branch from then block to endif block */ + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_branch, 
Format::PSEUDO_BRANCH, 0, 0)); + BB_then->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_then->index, &ic->BB_endif); + if (!ic->then_branch_divergent) + add_logical_edge(BB_then->index, &ic->BB_endif); + BB_then->kind |= block_kind_uniform; + } + + ctx->cf_info.has_branch = false; + ctx->cf_info.parent_loop.has_divergent_branch = false; + + /** emit else block */ + Block* BB_else = ctx->program->create_and_insert_block(); + BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth; + add_edge(ic->BB_if_idx, BB_else); + append_logical_start(BB_else); + ctx->block = BB_else; +} + +static void end_uniform_if(isel_context *ctx, if_context *ic) +{ + Block *BB_else = ctx->block; + + if (!ctx->cf_info.has_branch) { + append_logical_end(BB_else); + /* branch from then block to endif block */ + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); + BB_else->instructions.emplace_back(std::move(branch)); + add_linear_edge(BB_else->index, &ic->BB_endif); + if (!ctx->cf_info.parent_loop.has_divergent_branch) + add_logical_edge(BB_else->index, &ic->BB_endif); + BB_else->kind |= block_kind_uniform; + } + + ctx->cf_info.has_branch &= ic->uniform_has_then_branch; + ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; + + /** emit endif merge block */ + if (!ctx->cf_info.has_branch) { + ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); + append_logical_start(ctx->block); + } } -static void visit_if(isel_context *ctx, nir_if *if_stmt) +static bool visit_if(isel_context *ctx, nir_if *if_stmt) { Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa); Builder bld(ctx->program, ctx->block); aco_ptr branch; + if_context ic; if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */ /** @@ -7592,77 +9484,18 @@ static void visit_if(isel_context *ctx, nir_if *if_stmt) * to the loop exit/entry block. Otherwise, it branches to the next * merge block. 
**/ - append_logical_end(ctx->block); - ctx->block->kind |= block_kind_uniform; - /* emit branch */ - assert(cond.regClass() == bld.lm); // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction + assert(cond.regClass() == ctx->program->lane_mask); cond = bool_to_scalar_condition(ctx, cond); - branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0)); - branch->operands[0] = Operand(cond); - branch->operands[0].setFixed(scc); - ctx->block->instructions.emplace_back(std::move(branch)); - - unsigned BB_if_idx = ctx->block->index; - Block BB_endif = Block(); - BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth; - BB_endif.kind |= ctx->block->kind & block_kind_top_level; - - /** emit then block */ - Block* BB_then = ctx->program->create_and_insert_block(); - BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth; - add_edge(BB_if_idx, BB_then); - append_logical_start(BB_then); - ctx->block = BB_then; + begin_uniform_if_then(ctx, &ic, cond); visit_cf_list(ctx, &if_stmt->then_list); - BB_then = ctx->block; - bool then_branch = ctx->cf_info.has_branch; - bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; - - if (!then_branch) { - append_logical_end(BB_then); - /* branch from then block to endif block */ - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); - BB_then->instructions.emplace_back(std::move(branch)); - add_linear_edge(BB_then->index, &BB_endif); - if (!then_branch_divergent) - add_logical_edge(BB_then->index, &BB_endif); - BB_then->kind |= block_kind_uniform; - } - - ctx->cf_info.has_branch = false; - ctx->cf_info.parent_loop.has_divergent_branch = false; - /** emit else block */ - Block* BB_else = ctx->program->create_and_insert_block(); - BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth; - add_edge(BB_if_idx, BB_else); - append_logical_start(BB_else); - ctx->block = BB_else; + begin_uniform_if_else(ctx, &ic); visit_cf_list(ctx, &if_stmt->else_list); - BB_else = ctx->block; - - if (!ctx->cf_info.has_branch) { - append_logical_end(BB_else); - /* branch from then block to endif block */ - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); - BB_else->instructions.emplace_back(std::move(branch)); - add_linear_edge(BB_else->index, &BB_endif); - if (!ctx->cf_info.parent_loop.has_divergent_branch) - add_logical_edge(BB_else->index, &BB_endif); - BB_else->kind |= block_kind_uniform; - } - - ctx->cf_info.has_branch &= then_branch; - ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent; - /** emit endif merge block */ - if (!ctx->cf_info.has_branch) { - ctx->block = ctx->program->insert_block(std::move(BB_endif)); - append_logical_start(ctx->block); - } + end_uniform_if(ctx, &ic); } else { /* non-uniform condition */ /** * To maintain a logical and linear CFG without critical edges, @@ -7689,8 +9522,6 @@ static void visit_if(isel_context *ctx, nir_if *if_stmt) * *) Exceptions may be due to break and continue statements within loops **/ - if_context ic; - begin_divergent_if_then(ctx, &ic, cond); visit_cf_list(ctx, &if_stmt->then_list); @@ -7699,9 +9530,11 @@ static void visit_if(isel_context *ctx, nir_if *if_stmt) end_divergent_if(ctx, &ic); } + + return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); } -static void visit_cf_list(isel_context *ctx, +static bool visit_cf_list(isel_context *ctx, struct exec_list *list) { foreach_list_typed(nir_cf_node, node, node, list) { @@ -7710,7 
+9543,8 @@ static void visit_cf_list(isel_context *ctx, visit_block(ctx, nir_cf_node_as_block(node)); break; case nir_cf_node_if: - visit_if(ctx, nir_cf_node_as_if(node)); + if (!visit_if(ctx, nir_cf_node_as_if(node))) + return true; break; case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); @@ -7719,25 +9553,50 @@ static void visit_cf_list(isel_context *ctx, unreachable("unimplemented cf list type"); } } + return false; +} + +static void create_null_export(isel_context *ctx) +{ + /* Some shader stages always need to have exports. + * So when there is none, we need to add a null export. + */ + + unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS; + bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10; + Builder bld(ctx->program, ctx->block); + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), + /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm); } -static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) +static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) { - int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; - uint64_t mask = ctx->vs_output.mask[slot]; + assert(ctx->stage == vertex_vs || + ctx->stage == tess_eval_vs || + ctx->stage == gs_copy_vs || + ctx->stage == ngg_vertex_gs || + ctx->stage == ngg_tess_eval_gs); + + int offset = (ctx->stage & sw_tes) + ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot] + : ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; + uint64_t mask = ctx->outputs.mask[slot]; if (!is_pos && !mask) - return; + return false; if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) - return; + return false; aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; exp->enabled_mask = mask; for (unsigned i = 0; i < 4; ++i) { if (mask & (1 << i)) - exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]); + exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]); else exp->operands[i] = Operand(v1); } - exp->valid_mask = false; + /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. + * Setting valid_mask=1 prevents it and has no other effect. 
+ */ + exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0; exp->done = false; exp->compressed = false; if (is_pos) @@ -7745,6 +9604,8 @@ static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex else exp->dest = V_008DFC_SQ_EXP_PARAM + offset; ctx->block->instructions.emplace_back(std::move(exp)); + + return true; } static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos) @@ -7753,23 +9614,23 @@ static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos) exp->enabled_mask = 0; for (unsigned i = 0; i < 4; ++i) exp->operands[i] = Operand(v1); - if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) { - exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]); + if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) { + exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]); exp->enabled_mask |= 0x1; } - if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) { - exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]); + if (ctx->outputs.mask[VARYING_SLOT_LAYER]) { + exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]); exp->enabled_mask |= 0x4; } - if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) { + if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) { if (ctx->options->chip_class < GFX9) { - exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]); + exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]); exp->enabled_mask |= 0x8; } else { Builder bld(ctx->program, ctx->block); Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), - Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0])); + Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u])); if (exp->operands[2].isTemp()) out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]); @@ -7777,52 +9638,424 @@ static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos) exp->enabled_mask |= 0x4; } } - exp->valid_mask = false; + exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0; exp->done = false; exp->compressed = false; exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; ctx->block->instructions.emplace_back(std::move(exp)); } +static void create_export_phis(isel_context *ctx) +{ + /* Used when exports are needed, but the output temps are defined in a preceding block. + * This function will set up phis in order to access the outputs in the next block. + */ + + assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start); + aco_ptr logical_start = aco_ptr(ctx->block->instructions.back().release()); + ctx->block->instructions.pop_back(); + + Builder bld(ctx->program, ctx->block); + + for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) { + uint64_t mask = ctx->outputs.mask[slot]; + for (unsigned i = 0; i < 4; ++i) { + if (!(mask & (1 << i))) + continue; + + Temp old = ctx->outputs.temps[slot * 4 + i]; + Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1)); + ctx->outputs.temps[slot * 4 + i] = phi; + } + } + + bld.insert(std::move(logical_start)); +} + static void create_vs_exports(isel_context *ctx) { - radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo; + assert(ctx->stage == vertex_vs || + ctx->stage == tess_eval_vs || + ctx->stage == gs_copy_vs || + ctx->stage == ngg_vertex_gs || + ctx->stage == ngg_tess_eval_gs); + + radv_vs_output_info *outinfo = (ctx->stage & sw_tes) + ? 
&ctx->program->info->tes.outinfo + : &ctx->program->info->vs.outinfo; + + if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) { + ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; + ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id); + } + + if (ctx->options->key.has_multiview_view_index) { + ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1; + ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); + } + + /* the order these position exports are created is important */ + int next_pos = 0; + bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); + if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) { + export_vs_psiz_layer_viewport(ctx, &next_pos); + exported_pos = true; + } + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos); + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); + + if (ctx->export_clip_dists) { + if (ctx->num_clip_distances + ctx->num_cull_distances > 0) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos); + if (ctx->num_clip_distances + ctx->num_cull_distances > 4) + export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos); + } + + for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { + if (i < VARYING_SLOT_VAR0 && + i != VARYING_SLOT_LAYER && + i != VARYING_SLOT_PRIMITIVE_ID && + i != VARYING_SLOT_VIEWPORT) + continue; + + export_vs_varying(ctx, i, false, NULL); + } + + if (!exported_pos) + create_null_export(ctx); +} + +static bool export_fs_mrt_z(isel_context *ctx) +{ + Builder bld(ctx->program, ctx->block); + unsigned enabled_channels = 0; + bool compr = false; + Operand values[4]; + + for (unsigned i = 0; i < 4; ++i) { + values[i] = Operand(v1); + } + + /* Both stencil and sample mask only need 16-bits. */ + if (!ctx->program->info->ps.writes_z && + (ctx->program->info->ps.writes_stencil || + ctx->program->info->ps.writes_sample_mask)) { + compr = true; /* COMPR flag */ + + if (ctx->program->info->ps.writes_stencil) { + /* Stencil should be in X[23:16]. */ + values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]); + values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]); + enabled_channels |= 0x3; + } + + if (ctx->program->info->ps.writes_sample_mask) { + /* SampleMask should be in Y[15:0]. */ + values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); + enabled_channels |= 0xc; + } + } else { + if (ctx->program->info->ps.writes_z) { + values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]); + enabled_channels |= 0x1; + } + + if (ctx->program->info->ps.writes_stencil) { + values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]); + enabled_channels |= 0x2; + } + + if (ctx->program->info->ps.writes_sample_mask) { + values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); + enabled_channels |= 0x4; + } + } + + /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X + * writemask component. 
+ */ + if (ctx->options->chip_class == GFX6 && + ctx->options->family != CHIP_OLAND && + ctx->options->family != CHIP_HAINAN) { + enabled_channels |= 0x1; + } + + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], + enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr); + + return true; +} + +static bool export_fs_mrt_color(isel_context *ctx, int slot) +{ + Builder bld(ctx->program, ctx->block); + unsigned write_mask = ctx->outputs.mask[slot]; + Operand values[4]; + + for (unsigned i = 0; i < 4; ++i) { + if (write_mask & (1 << i)) { + values[i] = Operand(ctx->outputs.temps[slot * 4u + i]); + } else { + values[i] = Operand(v1); + } + } + + unsigned target, col_format; + unsigned enabled_channels = 0; + aco_opcode compr_op = (aco_opcode)0; + + slot -= FRAG_RESULT_DATA0; + target = V_008DFC_SQ_EXP_MRT + slot; + col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf; + + bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1; + bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1; + + switch (col_format) + { + case V_028714_SPI_SHADER_ZERO: + enabled_channels = 0; /* writemask */ + target = V_008DFC_SQ_EXP_NULL; + break; + + case V_028714_SPI_SHADER_32_R: + enabled_channels = 1; + break; + + case V_028714_SPI_SHADER_32_GR: + enabled_channels = 0x3; + break; + + case V_028714_SPI_SHADER_32_AR: + if (ctx->options->chip_class >= GFX10) { + /* Special case: on GFX10, the outputs are different for 32_AR */ + enabled_channels = 0x3; + values[1] = values[3]; + values[3] = Operand(v1); + } else { + enabled_channels = 0x9; + } + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pkrtz_f16_f32; + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: { + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pk_u16_u32; + if (is_int8 || is_int10) { + /* clamp */ + uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0; + Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); + + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), + i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val), + values[i]); + } + } + } + break; + } + + case V_028714_SPI_SHADER_SINT16_ABGR: + enabled_channels = 0x5; + compr_op = aco_opcode::v_cvt_pk_i16_i32; + if (is_int8 || is_int10) { + /* clamp */ + uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; + uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0; + Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb)); + Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb)); + + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), + i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val), + values[i]); + values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), + i == 3 && is_int10 ? 
Operand(-2u) : Operand(min_rgb_val), + values[i]); + } + } + } + break; + + case V_028714_SPI_SHADER_32_ABGR: + enabled_channels = 0xF; + break; + + default: + break; + } + + if (target == V_008DFC_SQ_EXP_NULL) + return false; + + if ((bool) compr_op) { + for (int i = 0; i < 2; i++) { + /* check if at least one of the values to be compressed is enabled */ + unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1; + if (enabled) { + enabled_channels |= enabled << (i*2); + values[i] = bld.vop3(compr_op, bld.def(v1), + values[i*2].isUndefined() ? Operand(0u) : values[i*2], + values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); + } else { + values[i] = Operand(v1); + } + } + values[2] = Operand(v1); + values[3] = Operand(v1); + } else { + for (int i = 0; i < 4; i++) + values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); + } + + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], + enabled_channels, target, (bool) compr_op); + return true; +} + +static void create_fs_exports(isel_context *ctx) +{ + bool exported = false; + + /* Export depth, stencil and sample mask. */ + if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || + ctx->outputs.mask[FRAG_RESULT_STENCIL] || + ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) + exported |= export_fs_mrt_z(ctx); + + /* Export all color render targets. */ + for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) + if (ctx->outputs.mask[i]) + exported |= export_fs_mrt_color(ctx, i); + + if (!exported) + create_null_export(ctx); +} + +static void write_tcs_tess_factors(isel_context *ctx) +{ + unsigned outer_comps; + unsigned inner_comps; + + switch (ctx->args->options->key.tcs.primitive_mode) { + case GL_ISOLINES: + outer_comps = 2; + inner_comps = 0; + break; + case GL_TRIANGLES: + outer_comps = 3; + inner_comps = 1; + break; + case GL_QUADS: + outer_comps = 4; + inner_comps = 2; + break; + default: + return; + } + + Builder bld(ctx->program, ctx->block); + + bld.barrier(aco_opcode::p_memory_barrier_shared); + if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size)) + bld.sopp(aco_opcode::s_barrier); + + Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids); + Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u)); + + Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id); + if_context ic_invocation_id_is_zero; + begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero); + bld.reset(ctx->block); + + Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u)); + + std::pair lds_base = get_tcs_output_lds_offset(ctx); + unsigned stride = inner_comps + outer_comps; + unsigned lds_align = calculate_lds_alignment(ctx, lds_base.second); + Temp tf_inner_vec; + Temp tf_outer_vec; + Temp out[6]; + assert(stride <= (sizeof(out) / sizeof(Temp))); + + if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) { + // LINES reversal + tf_outer_vec = load_lds(ctx, 4, bld.tmp(v2), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align); + out[1] = emit_extract_vector(ctx, tf_outer_vec, 0, v1); + out[0] = emit_extract_vector(ctx, tf_outer_vec, 1, v1); + } else { + tf_outer_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, outer_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align); + 
tf_inner_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, inner_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_in_loc, lds_align); - if (outinfo->export_prim_id) { - ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; - ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id); + for (unsigned i = 0; i < outer_comps; ++i) + out[i] = emit_extract_vector(ctx, tf_outer_vec, i, v1); + for (unsigned i = 0; i < inner_comps; ++i) + out[outer_comps + i] = emit_extract_vector(ctx, tf_inner_vec, i, v1); } - if (ctx->options->key.has_multiview_view_index) { - ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1; - ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); - } + Temp rel_patch_id = get_tess_rel_patch_id(ctx); + Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset); + Temp byte_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, stride * 4u); + unsigned tf_const_offset = 0; - /* the order these position exports are created is important */ - int next_pos = 0; - export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); - if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) { - export_vs_psiz_layer_viewport(ctx, &next_pos); - } - if (ctx->num_clip_distances + ctx->num_cull_distances > 0) - export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos); - if (ctx->num_clip_distances + ctx->num_cull_distances > 4) - export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); + if (ctx->program->chip_class <= GFX8) { + Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id); + if_context ic_rel_patch_id_is_zero; + begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero); + bld.reset(ctx->block); - if (ctx->options->key.vs_common_out.export_clip_dists) { - if (ctx->num_clip_distances + ctx->num_cull_distances > 0) - export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos); - if (ctx->num_clip_distances + ctx->num_cull_distances > 4) - export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos); + /* Store the dynamic HS control word. 
*/ + Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u)); + bld.mubuf(aco_opcode::buffer_store_dword, + /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word, + /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false, + /* disable_wqm */ false, /* glc */ true); + tf_const_offset += 4; + + begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero); + end_divergent_if(ctx, &ic_rel_patch_id_is_zero); + bld.reset(ctx->block); } - for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { - if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && - i != VARYING_SLOT_PRIMITIVE_ID) - continue; + assert(stride == 2 || stride == 4 || stride == 6); + Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u); + store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false); - export_vs_varying(ctx, i, false, NULL); + /* Store to offchip for TES to read - only if TES reads them */ + if (ctx->args->options->key.tcs.tes_reads_tess_factors) { + Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u)); + Temp oc_lds = get_arg(ctx, ctx->args->oc_lds); + + std::pair vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc); + store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false); + + if (likely(inner_comps)) { + std::pair vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc); + store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false); + } } + + begin_divergent_if_else(ctx, &ic_invocation_id_is_zero); + end_divergent_if(ctx, &ic_invocation_id_is_zero); } static void emit_stream_output(isel_context *ctx, @@ -7831,9 +10064,9 @@ static void emit_stream_output(isel_context *ctx, const struct radv_stream_output *output) { unsigned num_comps = util_bitcount(output->component_mask); + unsigned writemask = (1 << num_comps) - 1; unsigned loc = output->location; unsigned buf = output->buffer; - unsigned offset = output->offset; assert(num_comps && num_comps <= 4); if (!num_comps || num_comps > 4) @@ -7843,55 +10076,69 @@ static void emit_stream_output(isel_context *ctx, Temp out[4]; bool all_undef = true; - assert(ctx->stage == vertex_vs); + assert(ctx->stage & hw_vs); for (unsigned i = 0; i < num_comps; i++) { - out[i] = ctx->vs_output.outputs[loc][start + i]; + out[i] = ctx->outputs.temps[loc * 4 + start + i]; all_undef = all_undef && !out[i].id(); } if (all_undef) return; - Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)}; - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)}; - for (unsigned i = 0; i < num_comps; ++i) - vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u); - vec->definitions[0] = Definition(write_data); - ctx->block->instructions.emplace_back(std::move(vec)); + while (writemask) { + int start, count; + u_bit_scan_consecutive_range(&writemask, &start, &count); + if (count == 3 && ctx->options->chip_class == GFX6) { + /* GFX6 doesn't support storing vec3, split it. 
*/ + writemask |= 1u << (start + 2); + count = 2; + } - aco_opcode opcode; - switch (num_comps) { - case 1: - opcode = aco_opcode::buffer_store_dword; - break; - case 2: - opcode = aco_opcode::buffer_store_dwordx2; - break; - case 3: - opcode = aco_opcode::buffer_store_dwordx3; - break; - case 4: - opcode = aco_opcode::buffer_store_dwordx4; - break; - } + unsigned offset = output->offset + start * 4; - aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; - store->operands[0] = Operand(so_write_offset[buf]); - store->operands[1] = Operand(so_buffers[buf]); - store->operands[2] = Operand((uint32_t) 0); - store->operands[3] = Operand(write_data); - if (offset > 4095) { - /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */ - Builder bld(ctx->program, ctx->block); - store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); - } else { - store->offset = offset; + Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)}; + aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + for (int i = 0; i < count; ++i) + vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u); + vec->definitions[0] = Definition(write_data); + ctx->block->instructions.emplace_back(std::move(vec)); + + aco_opcode opcode; + switch (count) { + case 1: + opcode = aco_opcode::buffer_store_dword; + break; + case 2: + opcode = aco_opcode::buffer_store_dwordx2; + break; + case 3: + opcode = aco_opcode::buffer_store_dwordx3; + break; + case 4: + opcode = aco_opcode::buffer_store_dwordx4; + break; + default: + unreachable("Unsupported dword count."); + } + + aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; + store->operands[0] = Operand(so_buffers[buf]); + store->operands[1] = Operand(so_write_offset[buf]); + store->operands[2] = Operand((uint32_t) 0); + store->operands[3] = Operand(write_data); + if (offset > 4095) { + /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. 
*/ + Builder bld(ctx->program, ctx->block); + store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf])); + } else { + store->offset = offset; + } + store->offen = true; + store->glc = true; + store->dlc = false; + store->slc = true; + store->can_reorder = true; + ctx->block->instructions.emplace_back(std::move(store)); } - store->offen = true; - store->glc = true; - store->dlc = false; - store->slc = true; - store->can_reorder = true; - ctx->block->instructions.emplace_back(std::move(store)); } static void emit_streamout(isel_context *ctx, unsigned stream) @@ -7905,7 +10152,8 @@ static void emit_streamout(isel_context *ctx, unsigned stream) if (!stride) continue; - so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u)); + Operand off = bld.copy(bld.def(s1), Operand(i * 16u)); + so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off); } Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), @@ -7913,7 +10161,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream) Temp tid = emit_mbcnt(ctx, bld.def(v1)); - Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid); + Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid); if_context ic; begin_divergent_if_then(ctx, &ic, can_emit); @@ -7959,12 +10207,42 @@ static void emit_streamout(isel_context *ctx, unsigned stream) } /* end namespace */ +void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm) +{ + assert(ctx->shader->info.stage == MESA_SHADER_VERTEX); + Builder bld(ctx->program, ctx->block); + constexpr unsigned hs_idx = 1u; + Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->merged_wave_info), + Operand((8u << 16) | (hs_idx * 8u))); + Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp()); + + /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */ + + Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + get_arg(ctx, ctx->args->rel_auto_id), + get_arg(ctx, ctx->args->ac.instance_id), + ls_has_nonzero_hs_threads); + Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + get_arg(ctx, ctx->args->ac.tcs_rel_ids), + get_arg(ctx, ctx->args->rel_auto_id), + ls_has_nonzero_hs_threads); + Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), + get_arg(ctx, ctx->args->ac.tcs_patch_id), + get_arg(ctx, ctx->args->ac.vertex_id), + ls_has_nonzero_hs_threads); + + ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id; + ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id; + ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id; +} + void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm) { /* Split all arguments except for the first (ring_offsets) and the last * (exec) so that the dead channels don't stay live throughout the program. 
*/ - for (unsigned i = 1; i < startpgm->definitions.size() - 1; i++) { + for (int i = 1; i < startpgm->definitions.size() - 1; i++) { if (startpgm->definitions[i].regClass().size() > 1) { emit_split_vector(ctx, startpgm->definitions[i].getTemp(), startpgm->definitions[i].regClass().size()); @@ -8065,13 +10343,260 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) ctx->block->fp_mode = program->next_fp_mode; } +void cleanup_cfg(Program *program) +{ + /* create linear_succs/logical_succs */ + for (Block& BB : program->blocks) { + for (unsigned idx : BB.linear_preds) + program->blocks[idx].linear_succs.emplace_back(BB.index); + for (unsigned idx : BB.logical_preds) + program->blocks[idx].logical_succs.emplace_back(BB.index); + } +} + +Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i) +{ + Builder bld(ctx->program, ctx->block); + + /* The s_bfm only cares about s0.u[5:0] so we don't need either s_bfe nor s_and here */ + Temp count = i == 0 + ? get_arg(ctx, ctx->args->merged_wave_info) + : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u)); + + Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u)); + Temp cond; + + if (ctx->program->wave_size == 64) { + /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */ + Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */)); + cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64)); + } else { + /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */ + cond = emit_extract_vector(ctx, mask, 0, bld.lm); + } + + return cond; +} + +bool ngg_early_prim_export(isel_context *ctx) +{ + /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */ + return true; +} + +void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx) +{ + Builder bld(ctx->program, ctx->block); + + /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */ + bld.sopp(aco_opcode::s_setprio, -1u, 0x3u); + + /* Get the id of the current wave within the threadgroup (workgroup) */ + Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16))); + + /* Execute the following code only on the first wave (wave id 0), + * use the SCC def to tell if the wave id is zero or not. + */ + Temp cond = wave_id_in_tg.def(1).getTemp(); + if_context ic; + begin_uniform_if_then(ctx, &ic, cond); + begin_uniform_if_else(ctx, &ic); + bld.reset(ctx->block); + + /* Number of vertices output by VS/TES */ + Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u))); + /* Number of primitives output by VS/TES */ + Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u))); + + /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */ + Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u)); + tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt); + + /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. 
*/ + bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req); + + end_uniform_if(ctx, &ic); + + /* After the GS_ALLOC_REQ is done, reset priority to default (0). */ + bld.reset(ctx->block); + bld.sopp(aco_opcode::s_setprio, -1u, 0x0u); +} + +Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[]) +{ + Builder bld(ctx->program, ctx->block); + + if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) { + return get_arg(ctx, ctx->args->gs_vtx_offset[0]); + } + + Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id); + Temp tmp; + + for (unsigned i = 0; i < num_vertices; ++i) { + assert(vtxindex[i].id()); + + if (i) + tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp); + else + tmp = vtxindex[i]; + + /* The initial edge flag is always false in tess eval shaders. */ + if (ctx->stage == ngg_vertex_gs) { + Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u)); + tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp); + } + } + + /* TODO: Set isnull field in case of merged NGG VS+GS. */ + + return tmp; +} + +void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[]) +{ + Builder bld(ctx->program, ctx->block); + Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex); + + bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1), + 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, + false /* compressed */, true/* done */, false /* valid mask */); +} + +void ngg_emit_nogs_gsthreads(isel_context *ctx) +{ + /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS. + * These must always come before VS exports. + * + * It is recommended to do these as early as possible. They can be at the beginning when + * there is no SW GS and the shader doesn't write edge flags. + */ + + if_context ic; + Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1); + begin_divergent_if_then(ctx, &ic, is_gs_thread); + + Builder bld(ctx->program, ctx->block); + constexpr unsigned max_vertices_per_primitive = 3; + unsigned num_vertices_per_primitive = max_vertices_per_primitive; + + if (ctx->stage == ngg_vertex_gs) { + /* TODO: optimize for points & lines */ + } else if (ctx->stage == ngg_tess_eval_gs) { + if (ctx->shader->info.tess.point_mode) + num_vertices_per_primitive = 1; + else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES) + num_vertices_per_primitive = 2; + } else { + unreachable("Unsupported NGG shader stage"); + } + + Temp vtxindex[max_vertices_per_primitive]; + vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), + get_arg(ctx, ctx->args->gs_vtx_offset[0])); + vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) : + bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), + get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u)); + vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) : + bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), + get_arg(ctx, ctx->args->gs_vtx_offset[2])); + + /* Export primitive data to the index buffer. */ + ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex); + + /* Export primitive ID. 
*/ + if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) { + /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */ + Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id); + Temp provoking_vtx_index = vtxindex[0]; + Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u); + + store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u); + } + + begin_divergent_if_else(ctx, &ic); + end_divergent_if(ctx, &ic); +} + +void ngg_emit_nogs_output(isel_context *ctx) +{ + /* Emits NGG GS output, for stages that don't have SW GS. */ + + if_context ic; + Builder bld(ctx->program, ctx->block); + bool late_prim_export = !ngg_early_prim_export(ctx); + + /* NGG streamout is currently disabled by default. */ + assert(!ctx->args->shader_info->so.num_outputs); + + if (late_prim_export) { + /* VS exports are output to registers in a predecessor block. Emit phis to get them into this block. */ + create_export_phis(ctx); + /* Do what we need to do in the GS threads. */ + ngg_emit_nogs_gsthreads(ctx); + + /* What comes next should be executed on ES threads. */ + Temp is_es_thread = merged_wave_info_to_mask(ctx, 0); + begin_divergent_if_then(ctx, &ic, is_es_thread); + bld.reset(ctx->block); + } + + /* Export VS outputs */ + ctx->block->kind |= block_kind_export_end; + create_vs_exports(ctx); + + /* Export primitive ID */ + if (ctx->args->options->key.vs_common_out.export_prim_id) { + Temp prim_id; + + if (ctx->stage == ngg_vertex_gs) { + /* Wait for GS threads to store primitive ID in LDS. */ + bld.barrier(aco_opcode::p_memory_barrier_shared); + bld.sopp(aco_opcode::s_barrier); + + /* Calculate LDS address where the GS threads stored the primitive ID. */ + Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), + get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16))); + Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1)); + Temp wave_id_mul = bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size); + Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave)); + Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u); + + /* Load primitive ID from LDS. */ + prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u); + } else if (ctx->stage == ngg_tess_eval_gs) { + /* TES: Just use the patch ID as the primitive ID. 
*/ + prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id); + } else { + unreachable("unsupported NGG shader stage."); + } + + ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; + ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id; + + export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr); + } + + if (late_prim_export) { + begin_divergent_if_else(ctx, &ic); + end_divergent_if(ctx, &ic); + bld.reset(ctx->block); + } +} + void select_program(Program *program, unsigned shader_count, struct nir_shader *const *shaders, ac_shader_config* config, struct radv_shader_args *args) { - isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args); + isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false); + if_context ic_merged_wave_info; + bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs; for (unsigned i = 0; i < shader_count; i++) { nir_shader *nir = shaders[i]; @@ -8083,43 +10608,88 @@ void select_program(Program *program, /* needs to be after init_context() for FS */ Pseudo_instruction *startpgm = add_startpgm(&ctx); append_logical_start(ctx.block); + + if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs)) + fix_ls_vgpr_init_bug(&ctx, startpgm); + split_arguments(&ctx, startpgm); } - if_context ic; - if (shader_count >= 2) { - Builder bld(ctx.program, ctx.block); - Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u))); - Temp thread_id = emit_mbcnt(&ctx, bld.def(v1)); - Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id); + if (ngg_no_gs) { + ngg_emit_sendmsg_gs_alloc_req(&ctx); + + if (ngg_early_prim_export(&ctx)) + ngg_emit_nogs_gsthreads(&ctx); + } + + /* In a merged VS+TCS HS, the VS implementation can be completely empty. */ + nir_function_impl *func = nir_shader_get_entrypoint(nir); + bool empty_shader = nir_cf_list_is_empty_block(&func->body) && + ((nir->info.stage == MESA_SHADER_VERTEX && + (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) || + (nir->info.stage == MESA_SHADER_TESS_EVAL && + ctx.stage == tess_eval_geometry_gs)); - begin_divergent_if_then(&ctx, &ic, cond); + bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs); + bool endif_merged_wave_info = ctx.tcs_in_out_eq ? 
i == 1 : check_merged_wave_info; + if (check_merged_wave_info) { + Temp cond = merged_wave_info_to_mask(&ctx, i); + begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond); } if (i) { Builder bld(ctx.program, ctx.block); - bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages + + bld.barrier(aco_opcode::p_memory_barrier_shared); bld.sopp(aco_opcode::s_barrier); - } + + if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) { + ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u)); + } + } else if (ctx.stage == geometry_gs) + ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id); if (ctx.stage == fragment_fs) handle_bc_optimize(&ctx); - nir_function_impl *func = nir_shader_get_entrypoint(nir); visit_cf_list(&ctx, &func->body); - if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */) + if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs)) emit_streamout(&ctx, 0); - if (ctx.stage == vertex_vs) + if (ctx.stage & hw_vs) { create_vs_exports(&ctx); + ctx.block->kind |= block_kind_export_end; + } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) { + ngg_emit_nogs_output(&ctx); + } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { + Builder bld(ctx.program, ctx.block); + bld.barrier(aco_opcode::p_memory_barrier_gs_data); + bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0)); + } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + write_tcs_tess_factors(&ctx); + } - if (shader_count >= 2) { - begin_divergent_if_else(&ctx, &ic); - end_divergent_if(&ctx, &ic); + if (ctx.stage == fragment_fs) { + create_fs_exports(&ctx); + ctx.block->kind |= block_kind_export_end; } + if (endif_merged_wave_info) { + begin_divergent_if_else(&ctx, &ic_merged_wave_info); + end_divergent_if(&ctx, &ic_merged_wave_info); + } + + if (ngg_no_gs && !ngg_early_prim_export(&ctx)) + ngg_emit_nogs_output(&ctx); + ralloc_free(ctx.divergent_vals); + + if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) { + /* Outputs of the previous stage are inputs to the next stage */ + ctx.inputs = ctx.outputs; + ctx.outputs = shader_io_state(); + } } program->config->float_mode = program->blocks[0].fp_mode.val; @@ -8131,12 +10701,162 @@ void select_program(Program *program, bld.smem(aco_opcode::s_dcache_wb, false); bld.sopp(aco_opcode::s_endpgm); - /* cleanup CFG */ - for (Block& BB : program->blocks) { - for (unsigned idx : BB.linear_preds) - program->blocks[idx].linear_succs.emplace_back(BB.index); - for (unsigned idx : BB.logical_preds) - program->blocks[idx].logical_succs.emplace_back(BB.index); + cleanup_cfg(program); +} + +void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, + ac_shader_config* config, + struct radv_shader_args *args) +{ + isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true); + + program->next_fp_mode.preserve_signed_zero_inf_nan32 = false; + program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false; + program->next_fp_mode.must_flush_denorms32 = false; + program->next_fp_mode.must_flush_denorms16_64 = false; + program->next_fp_mode.care_about_round32 = false; + program->next_fp_mode.care_about_round16_64 = false; + program->next_fp_mode.denorm16_64 = fp_denorm_keep; + program->next_fp_mode.denorm32 = 0; + program->next_fp_mode.round32 = fp_round_ne; + program->next_fp_mode.round16_64 = fp_round_ne; + 
ctx.block->fp_mode = program->next_fp_mode;
+
+   add_startpgm(&ctx);
+   append_logical_start(ctx.block);
+
+   Builder bld(ctx.program, ctx.block);
+
+   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
+
+   Operand stream_id(0u);
+   if (args->shader_info->so.num_outputs)
+      stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
+                           get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
+
+   Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
+
+   std::stack<Block> endif_blocks;
+
+   for (unsigned stream = 0; stream < 4; stream++) {
+      if (stream_id.isConstant() && stream != stream_id.constantValue())
+         continue;
+
+      unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
+      if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
+         continue;
+
+      memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
+
+      unsigned BB_if_idx = ctx.block->index;
+      Block BB_endif = Block();
+      if (!stream_id.isConstant()) {
+         /* begin IF */
+         Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
+         append_logical_end(ctx.block);
+         ctx.block->kind |= block_kind_uniform;
+         bld.branch(aco_opcode::p_cbranch_z, cond);
+
+         BB_endif.kind |= ctx.block->kind & block_kind_top_level;
+
+         ctx.block = ctx.program->create_and_insert_block();
+         add_edge(BB_if_idx, ctx.block);
+         bld.reset(ctx.block);
+         append_logical_start(ctx.block);
+      }
+
+      unsigned offset = 0;
+      for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
+         if (args->shader_info->gs.output_streams[i] != stream)
+            continue;
+
+         unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
+         unsigned length = util_last_bit(output_usage_mask);
+         for (unsigned j = 0; j < length; ++j) {
+            if (!(output_usage_mask & (1 << j)))
+               continue;
+
+            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
+            Temp voffset = vtx_offset;
+            if (const_offset >= 4096u) {
+               voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
+               const_offset %= 4096u;
+            }
+
+            aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
+            mubuf->definitions[0] = bld.def(v1);
+            mubuf->operands[0] = Operand(gsvs_ring);
+            mubuf->operands[1] = Operand(voffset);
+            mubuf->operands[2] = Operand(0u);
+            mubuf->offen = true;
+            mubuf->offset = const_offset;
+            mubuf->glc = true;
+            mubuf->slc = true;
+            mubuf->dlc = args->options->chip_class >= GFX10;
+            mubuf->barrier = barrier_none;
+            mubuf->can_reorder = true;
+
+            ctx.outputs.mask[i] |= 1 << j;
+            ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
+
+            bld.insert(std::move(mubuf));
+
+            offset++;
+         }
+      }
+
+      if (args->shader_info->so.num_outputs) {
+         emit_streamout(&ctx, stream);
+         bld.reset(ctx.block);
+      }
+
+      if (stream == 0) {
+         create_vs_exports(&ctx);
+         ctx.block->kind |= block_kind_export_end;
+      }
+
+      if (!stream_id.isConstant()) {
+         append_logical_end(ctx.block);
+
+         /* branch from then block to endif block */
+         bld.branch(aco_opcode::p_branch);
+         add_edge(ctx.block->index, &BB_endif);
+         ctx.block->kind |= block_kind_uniform;
+
+         /* emit else block */
+         ctx.block = ctx.program->create_and_insert_block();
+         add_edge(BB_if_idx, ctx.block);
+         bld.reset(ctx.block);
+         append_logical_start(ctx.block);
+
+         endif_blocks.push(std::move(BB_endif));
+      }
+   }
+
+   while (!endif_blocks.empty()) {
+      Block BB_endif = 
std::move(endif_blocks.top()); + endif_blocks.pop(); + + Block *BB_else = ctx.block; + + append_logical_end(BB_else); + /* branch from else block to endif block */ + bld.branch(aco_opcode::p_branch); + add_edge(BB_else->index, &BB_endif); + BB_else->kind |= block_kind_uniform; + + /** emit endif merge block */ + ctx.block = program->insert_block(std::move(BB_endif)); + bld.reset(ctx.block); + append_logical_start(ctx.block); } + + program->config->float_mode = program->blocks[0].fp_mode.val; + + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_uniform; + bld.sopp(aco_opcode::s_endpgm); + + cleanup_cfg(program); } }
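
merged_wave_info_to_mask() above converts the 8-bit thread count of a merged shader part into a lane mask for the divergent branch that guards that part. A minimal standalone model of the wave64 case, written in plain C++ for illustration (it is not ACO IR, and the wave32 path, which only keeps the low half of the s_bfm result, is left out):

#include <cassert>
#include <cstdint>

/* Models what merged_wave_info_to_mask() computes for a wave64 program:
 * take the 8-bit thread count of merged part `part` and expand it into a
 * lane mask with that many low bits set. s_bfm_b64 only uses the low 6 bits
 * of the count, so a full wave (count == 64, bit 6 set) needs the
 * s_bitcmp1/s_cselect special case, modeled here by the `count & 0x40` test. */
static uint64_t live_mask_for_part(uint32_t merged_wave_info, unsigned part)
{
   uint32_t count = (merged_wave_info >> (part * 8u)) & 0xffu;
   if (count & 0x40u)                        /* 64 active invocations */
      return ~0ull;
   return (1ull << (count & 0x3fu)) - 1ull;  /* s_bfm_b64(count, 0) */
}

int main()
{
   assert(live_mask_for_part(0x0340u, 0) == ~0ull);  /* part 0: 64 threads */
   assert(live_mask_for_part(0x0340u, 1) == 0x7ull); /* part 1: 3 threads  */
   return 0;
}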
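
ngg_emit_sendmsg_gs_alloc_req() packs the threadgroup's primitive and vertex counts into m0 before issuing s_sendmsg(GS_ALLOC_REQ). A host-side sketch of that arithmetic, assuming only the gs_tg_info field layout implied by the s_bfe_u32 operands above (the helper name and the test values are made up for this note):

#include <cassert>
#include <cstdint>

/* Models the m0 value built before s_sendmsg(GS_ALLOC_REQ): the vertex count
 * occupies the low 12 bits and the primitive count starts at bit 12.  Both
 * counts are bitfield-extracted from gs_tg_info (vertices at offset 12,
 * primitives at offset 22, 9 bits each). */
static uint32_t gs_alloc_req_m0(uint32_t gs_tg_info)
{
   uint32_t vtx_cnt = (gs_tg_info >> 12) & 0x1ffu; /* s_bfe_u32(.., 12 | (9 << 16)) */
   uint32_t prm_cnt = (gs_tg_info >> 22) & 0x1ffu; /* s_bfe_u32(.., 22 | (9 << 16)) */
   return (prm_cnt << 12) | vtx_cnt;               /* s_lshl_b32 + s_or_b32 into m0 */
}

int main()
{
   uint32_t gs_tg_info = (5u << 12) | (3u << 22); /* 5 vertices, 3 primitives */
   assert(gs_alloc_req_m0(gs_tg_info) == ((3u << 12) | 5u));
   return 0;
}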
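
ngg_get_prim_exp_arg() assembles the single VGPR consumed by the primitive export: each vertex index is placed at bit 10*i with v_lshl_add_u32, and in the VS+GS stage an edge flag taken from gs_invocation_id is set one bit above it; the NGG passthrough case simply reuses gs_vtx_offset[0]. A hedged C++ sketch of that packing (helper and names are illustrative, not driver code):

#include <cassert>
#include <cstdint>

/* Models the non-passthrough primitive export argument: vertex index i goes
 * to bits [10*i, 10*i+8], its edge flag to bit 10*i+9. */
static uint32_t prim_exp_arg(unsigned num_vertices, const uint32_t vtxindex[3],
                             const bool edgeflag[3])
{
   uint32_t arg = 0;
   for (unsigned i = 0; i < num_vertices; ++i) {
      arg |= (vtxindex[i] & 0x1ffu) << (10u * i);   /* v_lshl_add_u32 chain */
      if (edgeflag && edgeflag[i])
         arg |= 1u << (10u * i + 9u);               /* v_bfe_u32 + v_lshl_add_u32 */
   }
   return arg;
}

int main()
{
   const uint32_t vtx[3] = { 1, 2, 3 };
   const bool edge[3] = { false, false, false };
   /* Triangle with vertex indices 1, 2, 3 and no edge flags set. */
   assert(prim_exp_arg(3, vtx, edge) == (1u | (2u << 10) | (3u << 20)));
   return 0;
}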