X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_instruction_selection.cpp;h=80ea1e133d661420025877f3f024807687d4555e;hb=626081fe4bf653a18cff9e25e6da2636bc58774a;hp=beaebdb125700bcd2b71c360e49f94c672bad93b;hpb=3ee3ad561a29d5429309571db489f95e4ccaec5b;p=mesa.git diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index beaebdb1257..80ea1e133d6 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -136,8 +136,11 @@ Temp emit_mbcnt(isel_context *ctx, Definition dst, if (ctx->program->wave_size == 32) { return thread_id_lo; + } else if (ctx->program->chip_class <= GFX7) { + Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo); + return thread_id_hi; } else { - Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo); + Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo); return thread_id_hi; } } @@ -169,33 +172,69 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data if (index.regClass() == s1) return bld.readlane(bld.def(s1), data, index); - Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); - - /* Currently not implemented on GFX6-7 */ - assert(ctx->options->chip_class >= GFX8); - - if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) { + if (ctx->options->chip_class <= GFX7) { + /* GFX6-7: there is no bpermute instruction */ + Operand index_op(index); + Operand input_data(data); + index_op.setLateKill(true); + input_data.setLateKill(true); + + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data); + } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) { + /* GFX10 wave64 mode: emulate full-wave bpermute */ + if (!ctx->has_gfx10_wave64_bpermute) { + ctx->has_gfx10_wave64_bpermute = true; + ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */ + ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */ + } + + Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index); + Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); + Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp()); + Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1); + Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); + Operand input_data(data); + + index_x4.setLateKill(true); + input_data.setLateKill(true); + same_half.setLateKill(true); + + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half); + } else { + /* GFX8-9 or GFX10 wave32: bpermute works normally */ + Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data); } +} - /* GFX10, wave64 mode: - * The bpermute instruction is limited to half-wave operation, which means that it can't - * properly support subgroup shuffle like older generations (or wave32 mode), so we - * emulate it here. 
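
/* Illustrative sketch only (standalone model, made-up names; needs C++20 for
 * std::popcount): what the v_mbcnt_lo_u32_b32 / v_mbcnt_hi_u32_b32 pair built
 * by emit_mbcnt() computes for a lane -- the number of set mask bits strictly
 * below that lane.  ds_bpermute_b32 then consumes such a lane index scaled by
 * 4, since it addresses lanes in bytes (the v_lshlrev_b32 by 2 above). */
#include <bit>
#include <cstdint>

static uint32_t model_mbcnt(uint64_t mask, unsigned lane /* 0..63 */)
{
    uint64_t below = mask & ((uint64_t(1) << lane) - 1u); /* bits under this lane */
    return std::popcount(below);  /* lo handles bits 0..31, hi adds bits 32..63 */
}
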
- */ - if (!ctx->has_gfx10_wave64_bpermute) { - ctx->has_gfx10_wave64_bpermute = true; - ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */ - ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */ - } +static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask) +{ + if (ctx->options->chip_class >= GFX8) { + unsigned and_mask = mask & 0x1f; + unsigned or_mask = (mask >> 5) & 0x1f; + unsigned xor_mask = (mask >> 10) & 0x1f; + + uint16_t dpp_ctrl = 0xffff; + + // TODO: we could use DPP8 for some swizzles + if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) { + unsigned res[4] = {0, 1, 2, 3}; + for (unsigned i = 0; i < 4; i++) + res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3; + dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]); + } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) { + dpp_ctrl = dpp_row_rr(8); + } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) { + dpp_ctrl = dpp_row_mirror; + } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) { + dpp_ctrl = dpp_row_half_mirror; + } - Temp lane_id = emit_mbcnt(ctx, bld.def(v1)); - Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id); - Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index); - Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi); + if (dpp_ctrl != 0xffff) + return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); + } - return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), - bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute); + return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false); } Temp as_vgpr(isel_context *ctx, Temp val) @@ -396,7 +435,7 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec); hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u)); if (select != Temp()) - hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select); + hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select)); lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift); Temp mid = bld.tmp(s1); lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo); @@ -407,38 +446,67 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) } } -/* this function trims subdword vectors: - * if dst is vgpr - split the src and create a shrunk version according to the mask. - * if dst is sgpr - split the src, but move the original to sgpr. 
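
/* Illustrative sketch only (standalone, made-up names): how the 15-bit
 * ds_swizzle mask handled by emit_masked_swizzle() above splits into
 * AND/OR/XOR fields, and how the quad-permute pattern is derived when the
 * AND field keeps all five lane bits (and_mask == 0x1f) and the OR/XOR
 * fields stay inside a quad (< 4). */
struct swizzle_fields { unsigned and_mask, or_mask, xor_mask; };

static swizzle_fields decode_swizzle(unsigned mask)
{
    return { mask & 0x1fu, (mask >> 5) & 0x1fu, (mask >> 10) & 0x1fu };
}

/* With and_mask == 0x1f the swizzle reduces to (lane | or_mask) ^ xor_mask,
 * which is what the patch folds into dpp_quad_perm(). */
static unsigned quad_perm_slot(unsigned lane_in_quad, swizzle_fields f)
{
    return ((lane_in_quad | f.or_mask) ^ f.xor_mask) & 0x3u;
}
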
*/ -void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) { - assert(vec_src.type() == RegType::vgpr); - emit_split_vector(ctx, vec_src, num_components); - Builder bld(ctx->program, ctx->block); - std::array elems; - unsigned component_size = vec_src.bytes() / num_components; - RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); + if (offset.isTemp()) { + Temp tmp[4] = {vec, vec, vec, vec}; - unsigned k = 0; - for (unsigned i = 0; i < num_components; i++) { - if (mask & (1 << i)) - elems[k++] = emit_extract_vector(ctx, vec_src, i, rc); + if (vec.size() == 4) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec); + } else if (vec.size() == 3) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); + } else if (vec.size() == 2) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); + } + for (unsigned i = 0; i < dst.size(); i++) + tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); + + vec = tmp[0]; + if (dst.size() == 2) + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); + + offset = Operand(0u); + } + + unsigned num_components = vec.bytes() / component_size; + if (vec.regClass() == dst.regClass()) { + assert(offset.constantValue() == 0); + bld.copy(Definition(dst), vec); + emit_split_vector(ctx, dst, num_components); + return; } + emit_split_vector(ctx, vec, num_components); + std::array elems; + RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); + + assert(offset.constantValue() % component_size == 0); + unsigned skip = offset.constantValue() / component_size; + for (unsigned i = skip; i < num_components; i++) + elems[i - skip] = emit_extract_vector(ctx, vec, i, rc); + + /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ if (dst.type() == RegType::vgpr) { - assert(dst.bytes() == k * component_size); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)}; - for (unsigned i = 0; i < k; i++) - vec->operands[i] = Operand(elems[i]); - vec->definitions[0] = Definition(dst); - bld.insert(std::move(vec)); + num_components = dst.bytes() / component_size; + aco_ptr create_vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + for (unsigned i = 0; i < num_components; i++) + create_vec->operands[i] = Operand(elems[i]); + create_vec->definitions[0] = Definition(dst); + bld.insert(std::move(create_vec)); + + /* if dst is sgpr - split the src, but move the original to sgpr. */ + } else if (skip) { + vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec); + byte_align_scalar(ctx, vec, offset, dst); } else { - // TODO: alignbyte if mask doesn't start with 1? 
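
/* Illustrative sketch only (standalone model; only the behaviour visible in
 * byte_align_vector() above is assumed): v_alignbyte_b32 roughly computes
 * ({hi,lo} >> (8 * (offset & 3))) & 0xffffffff, so chaining it over adjacent
 * dwords shifts a whole vector right by 'offset' bytes. */
#include <cstdint>

static uint32_t model_alignbyte(uint32_t hi, uint32_t lo, unsigned byte_offset)
{
    uint64_t packed = (uint64_t(hi) << 32) | lo;
    return uint32_t(packed >> (8u * (byte_offset & 3u)));
}

/* Aligning a two-dword vector by 'offset' bytes, mirroring the loop above
 * (tmp[2] = tmp[1] duplicates the last dword):
 *   out[0] = model_alignbyte(in[1], in[0], offset);
 *   out[1] = model_alignbyte(in[1], in[1], offset);
 */
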
- assert(mask & 1); - assert(dst.size() == vec_src.size()); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); + assert(dst.size() == vec.size()); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); } + ctx->allocated_vec.emplace(dst.id(), elems); } @@ -547,6 +615,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); sop2->definitions[0] = Definition(dst); + if (instr->no_unsigned_wrap) + sop2->definitions[0].setNUW(true); if (writes_scc) sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); ctx->block->instructions.emplace_back(std::move(sop2)); @@ -556,6 +626,8 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bool commutative, bool swap_srcs=false, bool flush_denorms = false) { Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]); Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]); if (src1.type() == RegType::sgpr) { @@ -577,6 +649,31 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o } } +void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr, + aco_opcode op, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + if (src1.type() == RegType::sgpr) { + assert(src0.type() == RegType::vgpr); + std::swap(src0, src1); + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(src0.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(v1); + Temp src11 = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + Temp lo = bld.vop2(op, bld.def(v1), src00, src10); + Temp hi = bld.vop2(op, bld.def(v1), src01, src11); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); +} + void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool flush_denorms = false) { @@ -594,6 +691,7 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode src2 = as_vgpr(ctx, src2); Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; if (flush_denorms && ctx->program->chip_class < GFX9) { assert(dst.size() == 1); Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2); @@ -606,7 +704,12 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); - bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); + bld.is_precise = instr->exact; + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); + else + bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); } void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) @@ -711,9 +814,8 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, { aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? 
s32_op : aco_opcode::num_opcodes; aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op; - bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index]; bool use_valu = s_op == aco_opcode::num_opcodes || - divergent_vals || + nir_dest_is_divergent(instr->dest.dest) || ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr || ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr; aco_opcode op = use_valu ? v_op : s_op; @@ -750,18 +852,12 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) if (dst.type() == RegType::vgpr) { aco_ptr bcsel; - if (dst.regClass() == v2b) { - then = as_vgpr(ctx, then); - els = as_vgpr(ctx, els); - - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), els, then, cond); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); - } else if (dst.regClass() == v1) { + if (dst.size() == 1) { then = as_vgpr(ctx, then); els = as_vgpr(ctx, els); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); - } else if (dst.regClass() == v2) { + } else if (dst.size() == 2) { Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then); Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); @@ -785,7 +881,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) assert(els.regClass() == bld.lm); } - if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */ + if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */ if (dst.regClass() == s1 || dst.regClass() == s2) { assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); assert(dst.size() == then.size()); @@ -884,7 +980,8 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); /* Extract the exponent and compute the unbiased value. */ - Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val); + Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u)); + exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u)); /* Extract the fractional part. */ Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); @@ -900,7 +997,7 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); /* Get the sign bit. */ - Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi); + Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi); /* Decide the operation to apply depending on the unbiased exponent. */ Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); @@ -918,7 +1015,8 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) if (ctx->options->chip_class >= GFX7) return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); - /* GFX6 doesn't support V_FLOOR_F64, lower it. */ + /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually + * lowered at NIR level for precision reasons). 
*/ Temp src0 = as_vgpr(ctx, val); Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */ @@ -944,7 +1042,7 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) return add->definitions[0].getTemp(); } -Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) { +Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) { if (!dst.id()) { if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u)); @@ -967,7 +1065,7 @@ Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, b bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src); else bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src); - } else { + } else if (ctx->options->chip_class >= GFX8) { assert(src_bits != 8 || src.regClass() == v1b); assert(src_bits != 16 || src.regClass() == v2b); aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; @@ -979,6 +1077,10 @@ Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, b sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword; sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword; bld.insert(std::move(sdwa)); + } else { + assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7); + aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32; + bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u)); } if (dst_bits == 64) { @@ -1005,6 +1107,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) abort(); } Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); switch(instr->op) { case nir_op_vec2: @@ -1062,13 +1165,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src); else unreachable("wrong src register class for nir_op_imov"); - } else if (dst.regClass() == v1) { - bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); - } else if (dst.regClass() == v2) { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); } else { - nir_print_instr(&instr->instr, stderr); - unreachable("Should have been lowered to scalar."); + if (dst.regClass() == v1) + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); + else if (dst.regClass() == v1b || + dst.regClass() == v2b || + dst.regClass() == v2) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + else + unreachable("wrong src register class for nir_op_imov"); } break; } @@ -1082,6 +1187,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm)); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); + } else if (dst.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo); + hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); } else if (dst.type() == RegType::sgpr) { aco_opcode opcode = dst.size() == 1 ? 
aco_opcode::s_not_b32 : aco_opcode::s_not_b64; bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); @@ -1217,6 +1328,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_boolean_logic(ctx, instr, Builder::s_or, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst); } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); } else if (dst.regClass() == s2) { @@ -1233,6 +1346,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_boolean_logic(ctx, instr, Builder::s_and, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst); } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); } else if (dst.regClass() == s2) { @@ -1249,6 +1364,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_boolean_logic(ctx, instr, Builder::s_xor, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst); } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); } else if (dst.regClass() == s2) { @@ -1601,9 +1718,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); } else if (dst.regClass() == v2) { @@ -1619,9 +1734,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); } else if (dst.regClass() == v2) { @@ -1637,12 +1750,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) - emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false); + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); else - emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); } else if (dst.regClass() == v1) { if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) 
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); @@ -1665,9 +1776,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { // TODO: check fp_mode.must_flush_denorms16_64 - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { @@ -1689,9 +1798,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { // TODO: check fp_mode.must_flush_denorms16_64 - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { @@ -1710,9 +1817,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmax3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1724,9 +1829,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmin3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1738,9 +1841,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmed3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1839,11 +1940,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frsq: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); } else if (dst.regClass() == v1) { emit_rsq(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { + /* Lowered at NIR level for precision reasons. 
*/ emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1855,8 +1956,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fneg: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src)); } else if (dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); @@ -1878,8 +1980,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fabs: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src)); } else if (dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); @@ -1901,8 +2004,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsat: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), Operand(0x3f800000u), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ @@ -1921,8 +2023,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_flog2: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); } else if (dst.regClass() == v1) { emit_log2(ctx, bld, Definition(dst), src); } else { @@ -1935,11 +2036,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frcp: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); } else if (dst.regClass() == v1) { emit_rcp(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { + /* Lowered at NIR level for precision reasons. 
*/ emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1950,9 +2051,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fexp2: { if (dst.regClass() == v2b) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); } else { @@ -1965,11 +2064,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsqrt: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); } else if (dst.regClass() == v1) { emit_sqrt(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { + /* Lowered at NIR level for precision reasons. */ emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1980,9 +2079,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ffract: { if (dst.regClass() == v2b) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); } else if (dst.regClass() == v2) { @@ -1997,8 +2094,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ffloor: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); } else if (dst.regClass() == v2) { @@ -2013,8 +2109,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fceil: { Temp src0 = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); } else if (dst.regClass() == v2) { @@ -2044,8 +2139,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ftrunc: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); } else if (dst.regClass() == v2) { @@ -2060,8 +2154,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fround_even: { Temp src0 = get_alu_src(ctx, 
instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); } else if (dst.regClass() == v2) { @@ -2102,13 +2195,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fcos: { Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); aco_ptr norm; - Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); if (dst.regClass() == v2b) { + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; - tmp = bld.vop1(opcode, bld.def(v1), tmp); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(opcode, Definition(dst), tmp); } else if (dst.regClass() == v1) { + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src); /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ @@ -2128,9 +2221,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1); } else if (dst.regClass() == v2) { @@ -2145,8 +2236,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frexp_sig: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src); } else if (dst.regClass() == v1) { bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src); } else if (dst.regClass() == v2) { @@ -2163,7 +2253,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size == 16) { Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src); tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u)); - convert_int(bld, tmp, 8, 32, true, dst); + convert_int(ctx, bld, tmp, 8, 32, true, dst); } else if (instr->src[0].src.ssa->bit_size == 32) { bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { @@ -2183,8 +2273,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond); cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), minus_one, src, cond); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, 
cond); } else if (dst.regClass() == v1) { Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); @@ -2212,16 +2301,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 64) src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); - src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src); + if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) + /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to + * keep value numbering and the scheduler simpler. + */ + bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src); + else + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); break; } case nir_op_f2f16_rtz: { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 64) src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); - src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src); + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u)); break; } case nir_op_f2f32: { @@ -2247,16 +2340,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(dst.regClass() == v2b); Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) - src = convert_int(bld, src, 8, 16, true); - Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + src = convert_int(ctx, bld, src, 8, 16, true); + else if (instr->src[0].src.ssa->bit_size == 64) + src = convert_int(ctx, bld, src, 64, 32, false); + bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); break; } case nir_op_i2f32: { assert(dst.size() == 1); Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); break; } @@ -2264,7 +2358,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32) { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { Temp src = get_alu_src(ctx, instr->src[0]); @@ -2287,20 +2381,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(dst.regClass() == v2b); Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) - src = convert_int(bld, src, 8, 16, false); - Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + src = convert_int(ctx, bld, src, 8, 16, false); + else if (instr->src[0].src.ssa->bit_size == 64) + src = convert_int(ctx, bld, src, 64, 32, false); + bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); break; } case nir_op_u2f32: { assert(dst.size() == 1); 
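
/* Illustrative sketch only (standalone model, made-up name): what the GFX6/7
 * fallback added to convert_int() computes with v_bfe_u32 / v_bfe_i32 when
 * widening an 8- or 16-bit value to 32 bits before the float conversion,
 * assuming the usual BFE semantics (offset 0, width = source bits). */
#include <cstdint>

static uint32_t model_bfe_widen(uint32_t src, unsigned src_bits, bool is_signed)
{
    uint32_t field = src & ((1u << src_bits) - 1u);     /* v_bfe_u32 src, 0, src_bits */
    if (is_signed && (field & (1u << (src_bits - 1))))  /* v_bfe_i32 additionally     */
        field |= ~((1u << src_bits) - 1u);              /* sign-extends the field     */
    return field;
}
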
Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) { - //TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src); } else { if (instr->src[0].src.ssa->bit_size == 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src); } break; @@ -2309,7 +2403,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32) { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, false); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { Temp src = get_alu_src(ctx, instr->src[0]); @@ -2329,34 +2423,22 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_f2i8: case nir_op_f2i16: { - Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 16) - src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst); else if (instr->src[0].src.ssa->bit_size == 32) - src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src); - else - src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src); - - if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); break; } case nir_op_f2u8: case nir_op_f2u16: { - Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 16) - src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst); else if (instr->src[0].src.ssa->bit_size == 32) - src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); else - src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src); - - if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); break; } case nir_op_f2i32: { @@ -2370,19 +2452,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); } } else if (instr->src[0].src.ssa->bit_size == 32) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); } else if (instr->src[0].src.ssa->bit_size == 64) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); } else { fprintf(stderr, 
"Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -2401,19 +2473,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); } } else if (instr->src[0].src.ssa->bit_size == 32) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); } else if (instr->src[0].src.ssa->bit_size == 64) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -2581,8 +2643,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src); } else if (dst.regClass() == v2b) { Temp one = bld.copy(bld.def(v1), Operand(0x3c00u)); - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src); } else { unreachable("Wrong destination register class for nir_op_b2f16."); } @@ -2622,7 +2683,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: { - convert_int(bld, get_alu_src(ctx, instr->src[0]), + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst); break; } @@ -2630,23 +2691,30 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_u2u16: case nir_op_u2u32: case nir_op_u2u64: { - convert_int(bld, get_alu_src(ctx, instr->src[0]), + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst); break; } case nir_op_b2b32: - case nir_op_b2i32: { + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: { Temp src = get_alu_src(ctx, instr->src[0]); assert(src.regClass() == bld.lm); - if (dst.regClass() == s1) { + Temp tmp = dst.bytes() == 8 ? 
bld.tmp(RegClass::get(dst.type(), 4)) : dst; + if (tmp.regClass() == s1) { // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ - bool_to_scalar_condition(ctx, src, dst); - } else if (dst.regClass() == v1) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src); + bool_to_scalar_condition(ctx, src, tmp); + } else if (tmp.type() == RegType::vgpr) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src); } else { unreachable("Invalid register class for b2i32"); } + + if (tmp != dst) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u)); break; } case nir_op_b2b1: @@ -2735,7 +2803,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_unpack_half_2x16_split_x: { if (dst.regClass() == v1) { - Builder bld(ctx->program, ctx->block); bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -2746,7 +2813,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_unpack_half_2x16_split_y: { if (dst.regClass() == v1) { - Builder bld(ctx->program, ctx->block); /* TODO: use SDWA here */ bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0])))); @@ -3058,35 +3124,6 @@ uint32_t widen_mask(uint32_t mask, unsigned multiplier) return new_mask; } -void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst) -{ - Builder bld(ctx->program, ctx->block); - if (offset.isTemp()) { - Temp tmp[3] = {vec, vec, vec}; - - if (vec.size() == 3) { - tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); - } else if (vec.size() == 2) { - tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); - } - for (unsigned i = 0; i < dst.size(); i++) - tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); - - vec = tmp[0]; - if (dst.size() == 2) - vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); - - offset = Operand(0u); - } - - if (vec.bytes() == dst.bytes() && offset.constantValue() == 0) - bld.copy(Definition(dst), vec); - else - trim_subdword_vector(ctx, vec, dst, vec.bytes(), ((1 << dst.bytes()) - 1) << offset.constantValue()); -} - struct LoadEmitInfo { Operand offset; Temp dst; @@ -3131,7 +3168,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1; if (byte_align) { - if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) { + if ((bytes_needed > 2 || + (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || + !supports_8bit_16bit_loads) && byte_align_loads) { if (info->component_stride) { assert(supports_8bit_16bit_loads && "unimplemented"); bytes_needed = 2; @@ -3193,7 +3232,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) /* align offset down if needed */ Operand aligned_offset = offset; + unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; if (need_to_align_offset) { + align = 4; Temp offset_tmp = offset.isTemp() ? 
offset.getTemp() : Temp(); if (offset.isConstant()) { aligned_offset = Operand(offset.constantValue() & 0xfffffffcu); @@ -3213,12 +3254,18 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset); - unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align, reduced_const_offset, byte_align ? Temp() : info->dst); + /* the callback wrote directly to dst */ + if (val == info->dst) { + assert(num_vals == 0); + emit_split_vector(ctx, info->dst, info->num_components); + return; + } + /* shift result right if needed */ - if (byte_align) { + if (info->component_size < 4 && byte_align_loads) { Operand align((uint32_t)byte_align); if (byte_align == -1) { if (offset.isConstant()) @@ -3229,15 +3276,12 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) align = offset; } - if (align.isTemp() || align.constantValue()) { - assert(val.bytes() >= load_size && "unimplemented"); - Temp new_val = bld.tmp(RegClass::get(val.type(), load_size)); - if (val.type() == RegType::sgpr) - byte_align_scalar(ctx, val, align, new_val); - else - byte_align_vector(ctx, val, align, new_val); - val = new_val; - } + assert(val.bytes() >= load_size && "unimplemented"); + if (val.type() == RegType::sgpr) + byte_align_scalar(ctx, val, align, info->dst); + else + byte_align_vector(ctx, val, align, info->dst, component_size); + return; } /* add result to list and advance */ @@ -3253,13 +3297,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) vals[num_vals++] = val; } - /* the callback wrote directly to dst */ - if (vals[0] == info->dst) { - assert(num_vals == 1); - emit_split_vector(ctx, info->dst, info->num_components); - return; - } - /* create array of components */ unsigned components_split = 0; std::array allocated_vec; @@ -3278,7 +3315,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) if (num_tmps > 1) { aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)}; - for (unsigned i = 0; i < num_vals; i++) + for (unsigned i = 0; i < num_tmps; i++) vec->operands[i] = Operand(tmp[i]); tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size)); vec->definitions[0] = Definition(tmp[0]); @@ -3477,10 +3514,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, unsigned bytes_size = 0; aco_opcode op; - if (bytes_needed == 1) { + if (bytes_needed == 1 || align_ % 2) { bytes_size = 1; op = aco_opcode::buffer_load_ubyte; - } else if (bytes_needed == 2) { + } else if (bytes_needed == 2 || align_ % 4) { bytes_size = 2; op = aco_opcode::buffer_load_ushort; } else if (bytes_needed <= 4) { @@ -3506,18 +3543,17 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->barrier = info->barrier; mubuf->can_reorder = info->can_reorder; mubuf->offset = const_offset; - RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); + mubuf->swizzled = info->swizzle_component_size != 0; + RegClass rc = RegClass::get(RegType::vgpr, bytes_size); Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
dst_hint : bld.tmp(rc); mubuf->definitions[0] = Definition(val); bld.insert(std::move(mubuf)); - if (bytes_size < 4) - val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); - return val; } static auto emit_mubuf_load = emit_load; +static auto emit_scratch_load = emit_load; Temp get_gfx6_global_rsrc(Builder& bld, Temp addr) { @@ -3586,9 +3622,6 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, bld.insert(std::move(flat)); } - if (bytes_size < 4) - val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); - return val; } @@ -3645,13 +3678,15 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp /* use allocated_vec if possible */ auto it = ctx->allocated_vec.find(src.id()); if (it != ctx->allocated_vec.end()) { - unsigned total_size = 0; - for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++) - total_size += it->second[i].bytes(); - if (total_size != src.bytes()) + if (!it->second[0].id()) goto split; - unsigned elem_size = it->second[0].bytes(); + assert(src.bytes() % elem_size == 0); + + for (unsigned i = 0; i < src.bytes() / elem_size; i++) { + if (!it->second[i].id()) + goto split; + } for (unsigned i = 0; i < count; i++) { if (offsets[i] % elem_size || dst[i].bytes() % elem_size) @@ -3683,10 +3718,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp } } + split: + if (dst_type == RegType::sgpr) src = bld.as_uniform(src); - split: /* just split it */ aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)}; split->operands[0] = Operand(src); @@ -3886,10 +3922,10 @@ void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem /* dword or larger stores have to be dword-aligned */ unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4; - unsigned align_offset = instr ? nir_intrinsic_align_mul(instr) : 0; - bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0; - if (bytes >= 4 && !dword_aligned) - bytes = MIN2(bytes, 2); + unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset; + bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0; + if (!dword_aligned) + bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1); advance_write_mask(&todo, offset, bytes); write_count_with_skips++; @@ -3963,7 +3999,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un } void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, - unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false) + unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false, + bool swizzled = false) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); @@ -3976,8 +4013,9 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); Operand soffset_op = soffset.id() ? 
Operand(soffset) : Operand(0u); Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, - /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc); + /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, + /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, + /* dlc*/ false, /* slc */ slc); static_cast(r.instr)->can_reorder = allow_reorder; } @@ -3987,7 +4025,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset bool allow_combining = true, bool reorder = true, bool slc = false) { Builder bld(ctx->program, ctx->block); - assert(elem_size_bytes == 4 || elem_size_bytes == 8); + assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); assert(write_mask); write_mask = widen_mask(write_mask, elem_size_bytes); @@ -3999,7 +4037,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining); } } @@ -4007,8 +4045,8 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true) { - assert(elem_size_bytes == 4 || elem_size_bytes == 8); - assert((num_components * elem_size_bytes / 4) == dst.size()); + assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); + assert((num_components * elem_size_bytes) == dst.bytes()); assert(!!stride != allow_combining); Builder bld(ctx->program, ctx->block); @@ -4153,11 +4191,9 @@ std::pair get_tcs_output_lds_offset(isel_context *ctx, nir_intri Builder bld(ctx->program, ctx->block); uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16; - uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; + uint32_t output_vertex_size = ctx->tcs_num_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; - uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16; std::pair offs = instr ? get_intrinsic_io_basic_offset(ctx, instr, 4u) @@ -4205,11 +4241,7 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx { Builder bld(ctx->program, ctx->block); - unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL - ? 
util_last_bit64(ctx->args->shader_info->tcs.outputs_written) - : ctx->args->options->key.tes.tcs_num_outputs; - - unsigned output_vertex_size = num_tcs_outputs * 16; + unsigned output_vertex_size = ctx->tcs_num_outputs * 16; unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches; unsigned attr_stride = ctx->tcs_num_patches; @@ -4230,10 +4262,12 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect) { + assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + if (mask == 0) return false; - unsigned off = nir_intrinsic_base(instr) * 4u; + unsigned drv_loc = nir_intrinsic_base(instr); nir_src *off_src = nir_get_io_offset_src(instr); if (!nir_src_is_const(*off_src)) { @@ -4242,15 +4276,10 @@ bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr } *indirect = false; - off += nir_src_as_uint(*off_src) * 16u; - - while (mask) { - unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0); - if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u) - return true; - } - - return false; + uint64_t slot = per_vertex + ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4] + : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0); + return (((uint64_t) 1) << slot) & mask; } bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) @@ -4269,10 +4298,12 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) if (instr->src[0].ssa->bit_size == 64) write_mask = widen_mask(write_mask, 2); + RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1; + for (unsigned i = 0; i < 8; ++i) { if (write_mask & (1 << i)) { ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u); - ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1); + ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc); } idx++; } @@ -4344,9 +4375,8 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS. * GFX9+: LS is merged into HS, but still uses the same LDS layout. 
*/ - unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id); - lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u); + lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u); } else { unreachable("Invalid LS or ES stage"); } @@ -4477,10 +4507,40 @@ void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp Temp coord2 = emit_extract_vector(ctx, src, 1, v1); Builder bld(ctx->program, ctx->block); - Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component); - if (ctx->program->has_16bank_lds) - interp_p1.instr->operands[0].setLateKill(true); - bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component); + + if (dst.regClass() == v2b) { + if (ctx->program->has_16bank_lds) { + assert(ctx->options->chip_class <= GFX8); + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), + Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component); + interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), + coord1, bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, + bld.m0(prim_mask), interp_p1, idx, component); + } else { + aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16; + + if (ctx->options->chip_class == GFX8) + interp_p2_op = aco_opcode::v_interp_p2_legacy_f16; + + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), + coord1, bld.m0(prim_mask), idx, component); + bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), + interp_p1, idx, component); + } + } else { + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, + bld.m0(prim_mask), idx, component); + + if (ctx->program->has_16bank_lds) + interp_p1.instr->operands[0].setLateKill(true); + + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, + bld.m0(prim_mask), interp_p1, idx, component); + } } void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) @@ -4646,6 +4706,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset; unsigned component = nir_intrinsic_component(instr); + unsigned bitsize = instr->dest.ssa.bit_size; unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; @@ -4702,7 +4763,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) /* load channels */ while (channel_start < num_channels) { - unsigned fetch_size = num_channels - channel_start; + unsigned fetch_component = num_channels - channel_start; unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; bool expanded = false; @@ -4714,15 +4775,17 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) vtx_info->chan_byte_size == 4; unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; if (!use_mubuf) { - fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size); + fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component); } else { - if (fetch_size == 3 && 
ctx->options->chip_class == GFX6) { + if (fetch_component == 3 && ctx->options->chip_class == GFX6) { /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */ - fetch_size = 4; + fetch_component = 4; expanded = true; } } + unsigned fetch_bytes = fetch_component * bitsize / 8; + Temp fetch_index = index; if (attrib_stride != 0 && fetch_offset > attrib_stride) { fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); @@ -4736,19 +4799,37 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } aco_opcode opcode; - switch (fetch_size) { - case 1: - opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; - break; + switch (fetch_bytes) { case 2: - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + assert(!use_mubuf && bitsize == 16); + opcode = aco_opcode::tbuffer_load_format_d16_x; + break; + case 4: + if (bitsize == 16) { + assert(!use_mubuf); + opcode = aco_opcode::tbuffer_load_format_d16_xy; + } else { + opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; + } + break; + case 6: + assert(!use_mubuf && bitsize == 16); + opcode = aco_opcode::tbuffer_load_format_d16_xyz; + break; + case 8: + if (bitsize == 16) { + assert(!use_mubuf); + opcode = aco_opcode::tbuffer_load_format_d16_xyzw; + } else { + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + } break; - case 3: + case 12: assert(ctx->options->chip_class >= GFX7 || (!use_mubuf && ctx->options->chip_class == GFX6)); opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; break; - case 4: + case 16: opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; break; default: @@ -4756,19 +4837,19 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } Temp fetch_dst; - if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle && + if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE || num_channels <= 3)) { direct_fetch = true; fetch_dst = dst; } else { - fetch_dst = bld.tmp(RegType::vgpr, fetch_size); + fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes)); } if (use_mubuf) { Instruction *mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, true).instr; + fetch_offset, false, false, true).instr; static_cast(mubuf)->can_reorder = true; } else { Instruction *mtbuf = bld.mtbuf(opcode, @@ -4779,14 +4860,15 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, fetch_dst, fetch_dst.size()); - if (fetch_size == 1) { + if (fetch_component == 1) { channels[channel_start] = fetch_dst; } else { - for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++) - channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1); + for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) + channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, + bitsize == 16 ? 
v2b : v1); } - channel_start += fetch_size; + channel_start += fetch_component; } if (!direct_fetch) { @@ -5066,7 +5148,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); Temp index = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) + if (!nir_dest_is_divergent(instr->dest)) index = bld.as_uniform(index); unsigned desc_set = nir_intrinsic_desc_set(instr); unsigned binding = nir_intrinsic_binding(instr); @@ -5126,11 +5208,11 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size, Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, - bool glc=false, bool readonly=true) + bool glc=false, bool readonly=true, bool allow_smem=true) { Builder bld(ctx->program, ctx->block); - bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly); + bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem; if (use_smem) offset = bld.as_uniform(offset); @@ -5213,7 +5295,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); if (offset != 0) // TODO check if index != 0 as well - index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); Temp vec = dst; bool trim = false; @@ -5255,7 +5337,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) unreachable("unimplemented or forbidden load_push_constant."); } - bld.smem(op, Definition(vec), ptr, index); + static_cast(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true; if (!aligned) { Operand byte_offset = index_cv ? 
Operand((offset + index_cv->u32) % 4) : Operand(index); @@ -5299,7 +5381,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); if (base && offset.type() == RegType::sgpr) - offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); + offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); else if (base && offset.type() == RegType::vgpr) offset = bld.vadd32(bld.def(v1), Operand(base), offset); @@ -5659,8 +5741,12 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec ctx->block->instructions.emplace_back(std::move(load)); Operand sample_index4; - if (sample_index.isConstant() && sample_index.constantValue() < 16) { - sample_index4 = Operand(sample_index.constantValue() << 2); + if (sample_index.isConstant()) { + if (sample_index.constantValue() < 16) { + sample_index4 = Operand(sample_index.constantValue() << 2); + } else { + sample_index4 = Operand(0u); + } } else if (sample_index.regClass() == s1) { sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); } else { @@ -6121,10 +6207,19 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + unsigned access = nir_intrinsic_access(instr); + bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); unsigned size = instr->dest.ssa.bit_size / 8; + + uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access); + /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores + * TODO: this optimization is disabled for now because we still need to ensure correct ordering + */ + bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store)); + allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER); + load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), - nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false); + nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem); } void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) @@ -6138,9 +6233,17 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); - bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] && + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr)); + /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores + * TODO: this optimization is disabled for now because we still need to ensure correct ordering + */ + bool allow_smem = !(flags & (0 && glc ? 
has_nonglc_vmem_loadstore : has_vmem_loadstore)); + + bool smem = !nir_src_is_divergent(instr->src[2]) && ctx->options->chip_class >= GFX8 && - elem_size_bytes >= 4; + (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) && + allow_smem; if (smem) offset = bld.as_uniform(offset); bool smem_nonfs = smem && ctx->stage != fragment_fs; @@ -6160,8 +6263,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) aco_ptr store{create_instruction(op, Format::SMEM, 3, 0)}; store->operands[0] = Operand(rsrc); if (offsets[i]) { - Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - offset, Operand(offsets[i])); + Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + offset, Operand(offsets[i])); store->operands[1] = Operand(off); } else { store->operands[1] = Operand(offset); @@ -6169,7 +6272,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) if (op != aco_opcode::p_fs_buffer_store_smem) store->operands[1].setFixed(m0); store->operands[2] = Operand(write_datas[i]); - store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->glc = glc; store->dlc = false; store->disable_wqm = true; store->barrier = barrier_buffer; @@ -6187,7 +6290,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->operands[3] = Operand(write_datas[i]); store->offset = offsets[i]; store->offen = (offset.type() == RegType::vgpr); - store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->glc = glc; store->dlc = false; store->disable_wqm = true; store->barrier = barrier_buffer; @@ -6687,7 +6790,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) op32 = aco_opcode::ds_write_b32; op64 = aco_opcode::ds_write_b64; op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; - op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64; + op64_rtn = aco_opcode::ds_wrxchg_rtn_b64; break; case nir_intrinsic_shared_atomic_comp_swap: op32 = aco_opcode::ds_cmpst_b32; @@ -6696,6 +6799,12 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) op64_rtn = aco_opcode::ds_cmpst_rtn_b64; num_operands = 4; break; + case nir_intrinsic_shared_atomic_fadd: + op32 = aco_opcode::ds_add_f32; + op32_rtn = aco_opcode::ds_add_rtn_f32; + op64 = aco_opcode::num_opcodes; + op64_rtn = aco_opcode::num_opcodes; + break; default: unreachable("Unhandled shared atomic intrinsic"); } @@ -6746,7 +6855,7 @@ Temp get_scratch_resource(isel_context *ctx) scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u)); uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | - S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);; + S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2); if (ctx->program->chip_class >= GFX10) { rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | @@ -6757,9 +6866,9 @@ Temp get_scratch_resource(isel_context *ctx) S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } - /* older generations need element size = 16 bytes. element size removed in GFX9 */ + /* older generations need element size = 4 bytes. 
element size removed in GFX9 */ if (ctx->program->chip_class <= GFX8) - rsrc_conf |= S_008F0C_ELEMENT_SIZE(3); + rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); } @@ -6774,10 +6883,10 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { instr->dest.ssa.bit_size / 8u, rsrc}; info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); - info.swizzle_component_size = 16; + info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0; info.can_reorder = false; info.soffset = ctx->program->scratch_offset; - emit_mubuf_load(ctx, bld, &info); + emit_scratch_load(ctx, bld, &info); } void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { @@ -6792,12 +6901,13 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { unsigned write_count = 0; Temp write_datas[32]; unsigned offsets[32]; + unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16; split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, - 16, &write_count, write_datas, offsets); + swizzle_component_size, &write_count, write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); - bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true); + bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); } } @@ -7178,6 +7288,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); Temp private_segment_buffer = ctx->program->private_segment_buffer; + //TODO: bounds checking? if (addr.type() == RegType::sgpr) { Operand offset; if (const_addr) { @@ -7350,6 +7461,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_shared_atomic_exchange: case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_fadd: visit_shared_atomic(ctx, instr); break; case nir_intrinsic_image_deref_load: @@ -7529,14 +7641,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_shuffle: case nir_intrinsic_read_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->src[0].ssa->index]) { + if (!nir_src_is_divergent(instr->src[0])) { emit_uniform_subgroup(ctx, instr, src); } else { Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); - if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index]) + if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1])) tid = bld.as_uniform(tid); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (src.regClass() == v1) { + if (src.regClass() == v1b || src.regClass() == v2b) { + Temp tmp = bld.tmp(v1); + tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp); + if (dst.type() == RegType::vgpr) + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? 
v3b : v2b), tmp); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else if (src.regClass() == v1) { emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst); } else if (src.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); @@ -7581,7 +7700,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_read_first_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (src.regClass() == v1) { + if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst); @@ -7639,7 +7758,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) nir_intrinsic_cluster_size(instr) : 0; cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size)); - if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) { + if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) { emit_uniform_subgroup(ctx, instr, src); } else if (instr->dest.ssa.bit_size == 1) { if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin) @@ -7666,27 +7785,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } else if (cluster_size == 1) { bld.copy(Definition(dst), src); } else { - src = as_vgpr(ctx, src); + unsigned bit_size = instr->src[0].ssa->bit_size; + + src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8)); ReduceOp reduce_op; switch (op) { - #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break; - CASE(iadd) - CASE(imul) - CASE(fadd) - CASE(fmul) - CASE(imin) - CASE(umin) - CASE(fmin) - CASE(imax) - CASE(umax) - CASE(fmax) - CASE(iand) - CASE(ior) - CASE(ixor) + #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break; + #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? 
name##16 : name##64; break; + CASEI(iadd) + CASEI(imul) + CASEI(imin) + CASEI(umin) + CASEI(imax) + CASEI(umax) + CASEI(iand) + CASEI(ior) + CASEI(ixor) + CASEF(fadd) + CASEF(fmul) + CASEF(fmin) + CASEF(fmax) default: unreachable("unknown reduction op"); - #undef CASE + #undef CASEI + #undef CASEF } aco_opcode aco_op; @@ -7722,7 +7845,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_quad_broadcast: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); } else { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -7739,6 +7862,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 8) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); + } else if (instr->dest.ssa.bit_size == 16) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { if (ctx->program->chip_class >= GFX8) emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst); @@ -7769,7 +7906,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_quad_swap_diagonal: case nir_intrinsic_quad_swizzle_amd: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); break; } @@ -7803,6 +7940,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl); Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 8) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); + } else if (instr->dest.ssa.bit_size == 16) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { Temp tmp; if (ctx->program->chip_class >= GFX8) @@ -7831,21 +7982,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_masked_swizzle_amd: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - 
if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); break; } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); uint32_t mask = nir_intrinsic_swizzle_mask(instr); - if (dst.regClass() == v1) { - emit_wqm(ctx, - bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false), - dst); + if (instr->dest.ssa.bit_size == 1) { + assert(src.regClass() == bld.lm); + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + src = emit_masked_swizzle(ctx, bld, src, mask); + Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); + emit_wqm(ctx, tmp, dst); + } else if (dst.regClass() == v1b) { + Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask)); + emit_extract_vector(ctx, tmp, 0, dst); + } else if (dst.regClass() == v2b) { + Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask)); + emit_extract_vector(ctx, tmp, 0, dst); + } else if (dst.regClass() == v1) { + emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst); } else if (dst.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); - lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false)); - hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false)); + lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask)); + hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); } else { @@ -7928,10 +8089,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) get_ssa_temp(ctx, &instr->dest.ssa)); break; } - case nir_intrinsic_shader_clock: - bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); + case nir_intrinsic_shader_clock: { + aco_opcode opcode = + nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ? 
+ aco_opcode::s_memrealtime : aco_opcode::s_memtime; + bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2); break; + } case nir_intrinsic_load_vertex_id_zero_base: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id)); @@ -8223,9 +8388,11 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) { Builder bld(ctx->program, ctx->block); bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, - has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false; + has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false, + has_clamped_lod = false; Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), - lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(); + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), + clamped_lod = Temp(); std::vector coords; std::vector derivs; nir_const_value *sample_index_cv = NULL; @@ -8247,10 +8414,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) break; } case nir_tex_src_bias: - if (instr->op == nir_texop_txb) { - bias = get_ssa_temp(ctx, instr->src[i].src.ssa); - has_bias = true; - } + bias = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_bias = true; break; case nir_tex_src_lod: { nir_const_value *val = nir_src_as_const_value(instr->src[i].src); @@ -8263,6 +8428,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } break; } + case nir_tex_src_min_lod: + clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_clamped_lod = true; + break; case nir_tex_src_comparator: if (instr->is_shadow) { compare = get_ssa_temp(ctx, instr->src[i].src.ssa); @@ -8303,10 +8472,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16)); Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */)); - Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u)); + Operand default_sample = Operand(1u); + if (ctx->options->robust_buffer_access) { + /* Extract the second dword of the descriptor, if it's + * all zero, then it's a null descriptor. + */ + Temp dword1 = emit_extract_vector(ctx, resource, 1, s1); + Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u)); + default_sample = Operand(is_non_null_descriptor); + } + + Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u)); bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - samples, Operand(1u), bld.scc(is_msaa)); + samples, default_sample, bld.scc(is_msaa)); return; } @@ -8653,6 +8832,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) args.emplace_back(sample_index); if (has_lod) args.emplace_back(lod); + if (has_clamped_lod) + args.emplace_back(clamped_lod); Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size())); aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)}; @@ -8697,7 +8878,21 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) // TODO: would be better to do this by adding offsets, but needs the opcodes ordered. 
aco_opcode opcode = aco_opcode::image_sample; if (has_offset) { /* image_sample_*_o */ - if (has_compare) { + if (has_clamped_lod) { + if (has_compare) { + opcode = aco_opcode::image_sample_c_cl_o; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_cl_o; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_cl_o; + } else { + opcode = aco_opcode::image_sample_cl_o; + if (has_derivs) + opcode = aco_opcode::image_sample_d_cl_o; + if (has_bias) + opcode = aco_opcode::image_sample_b_cl_o; + } + } else if (has_compare) { opcode = aco_opcode::image_sample_c_o; if (has_derivs) opcode = aco_opcode::image_sample_c_d_o; @@ -8718,6 +8913,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (has_lod) opcode = aco_opcode::image_sample_l_o; } + } else if (has_clamped_lod) { /* image_sample_*_cl */ + if (has_compare) { + opcode = aco_opcode::image_sample_c_cl; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_cl; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_cl; + } else { + opcode = aco_opcode::image_sample_cl; + if (has_derivs) + opcode = aco_opcode::image_sample_d_cl; + if (has_bias) + opcode = aco_opcode::image_sample_b_cl; + } } else { /* no offset */ if (has_compare) { opcode = aco_opcode::image_sample_c; @@ -8743,14 +8952,34 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (instr->op == nir_texop_tg4) { - if (has_offset) { - opcode = aco_opcode::image_gather4_lz_o; - if (has_compare) + if (has_offset) { /* image_gather4_*_o */ + if (has_compare) { opcode = aco_opcode::image_gather4_c_lz_o; + if (has_lod) + opcode = aco_opcode::image_gather4_c_l_o; + if (has_bias) + opcode = aco_opcode::image_gather4_c_b_o; + } else { + opcode = aco_opcode::image_gather4_lz_o; + if (has_lod) + opcode = aco_opcode::image_gather4_l_o; + if (has_bias) + opcode = aco_opcode::image_gather4_b_o; + } } else { - opcode = aco_opcode::image_gather4_lz; - if (has_compare) + if (has_compare) { opcode = aco_opcode::image_gather4_c_lz; + if (has_lod) + opcode = aco_opcode::image_gather4_c_l; + if (has_bias) + opcode = aco_opcode::image_gather4_c_b; + } else { + opcode = aco_opcode::image_gather4_lz; + if (has_lod) + opcode = aco_opcode::image_gather4_l; + if (has_bias) + opcode = aco_opcode::image_gather4_b; + } } } else if (instr->op == nir_texop_lod) { opcode = aco_opcode::image_get_lod; @@ -8801,13 +9030,19 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } -Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa) +Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical) { Temp tmp = get_ssa_temp(ctx, ssa); - if (ssa->parent_instr->type == nir_instr_type_ssa_undef) - return Operand(tmp.regClass()); - else + if (ssa->parent_instr->type == nir_instr_type_ssa_undef) { + return Operand(rc); + } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) { + if (ctx->program->wave_size == 64) + return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u); + else + return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? 
UINT32_MAX : 0u); + } else { return Operand(tmp); + } } void visit_phi(isel_context *ctx, nir_phi_instr *instr) @@ -8816,7 +9051,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); - bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index]; + bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest); logical |= ctx->block->kind & block_kind_merge; aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi; @@ -8850,7 +9085,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size()) continue; cur_pred_idx++; - Operand op = get_phi_operand(ctx, src.second); + Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical); operands[num_operands++] = op; num_defined += !op.isUndefined(); } @@ -9510,7 +9745,7 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) aco_ptr branch; if_context ic; - if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */ + if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */ /** * Uniform conditionals are represented in the following way*) : * @@ -9538,8 +9773,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) visit_cf_list(ctx, &if_stmt->else_list); end_uniform_if(ctx, &ic); - - return !ctx->cf_info.has_branch; } else { /* non-uniform condition */ /** * To maintain a logical and linear CFG without critical edges, @@ -9573,9 +9806,9 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) visit_cf_list(ctx, &if_stmt->else_list); end_divergent_if(ctx, &ic); - - return true; } + + return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); } static bool visit_cf_list(isel_context *ctx, @@ -9856,6 +10089,7 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1; bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1; + bool is_16bit = values[0].regClass() == v2b; switch (col_format) { @@ -9886,16 +10120,38 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) case V_028714_SPI_SHADER_FP16_ABGR: enabled_channels = 0x5; compr_op = aco_opcode::v_cvt_pkrtz_f16_f32; + if (is_16bit) { + if (ctx->options->chip_class >= GFX9) { + /* Pack the FP16 values together instead of converting them to + * FP32 and back to FP16. + * TODO: use p_create_vector and let the compiler optimizes. 
+ */ + compr_op = aco_opcode::v_pack_b32_f16; + } else { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) + values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]); + } + } + } break; case V_028714_SPI_SHADER_UNORM16_ABGR: enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + if (is_16bit && ctx->options->chip_class >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_u16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + } break; case V_028714_SPI_SHADER_SNORM16_ABGR: enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + if (is_16bit && ctx->options->chip_class >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_i16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + } break; case V_028714_SPI_SHADER_UINT16_ABGR: { @@ -9913,6 +10169,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i]); } } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false); + values[i] = Operand(tmp); + } + } } break; } @@ -9937,6 +10200,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i]); } } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true); + values[i] = Operand(tmp); + } + } } break; @@ -9951,6 +10221,26 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) if (target == V_008DFC_SQ_EXP_NULL) return false; + /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ + if (ctx->options->enable_mrt_output_nan_fixup && + !is_16bit && + (col_format == V_028714_SPI_SHADER_32_R || + col_format == V_028714_SPI_SHADER_32_GR || + col_format == V_028714_SPI_SHADER_32_AR || + col_format == V_028714_SPI_SHADER_32_ABGR || + col_format == V_028714_SPI_SHADER_FP16_ABGR)) { + for (int i = 0; i < 4; i++) { + if (!(write_mask & (1 << i))) + continue; + + Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, + bld.hint_vcc(bld.def(bld.lm)), values[i], + bld.copy(bld.def(v1), Operand(3u))); + values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], + bld.copy(bld.def(v1), Operand(0u)), isnan); + } + } + if ((bool) compr_op) { for (int i = 0; i < 2; i++) { /* check if at least one of the values to be compressed is enabled */ @@ -10071,8 +10361,8 @@ static void write_tcs_tess_factors(isel_context *ctx) Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u)); bld.mubuf(aco_opcode::buffer_store_dword, /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word, - /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true); + /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false, + /* addr64 */ false, /* disable_wqm */ false, /* glc */ true); tf_const_offset += 4; begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero); @@ -10362,7 +10652,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); - /* default to preserving fp16 and fp64 denorms, since it's free */ + /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and + * the precision seems needed for Wolfenstein: Youngblood to render correctly */ 
if (program->next_fp_mode.must_flush_denorms16_64) program->next_fp_mode.denorm16_64 = 0; else @@ -10727,8 +11018,6 @@ void select_program(Program *program, if (ngg_no_gs && !ngg_early_prim_export(&ctx)) ngg_emit_nogs_output(&ctx); - ralloc_free(ctx.divergent_vals); - if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) { /* Outputs of the previous stage are inputs to the next stage */ ctx.inputs = ctx.outputs; @@ -10754,16 +11043,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, { isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true); - program->next_fp_mode.preserve_signed_zero_inf_nan32 = false; - program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false; - program->next_fp_mode.must_flush_denorms32 = false; - program->next_fp_mode.must_flush_denorms16_64 = false; - program->next_fp_mode.care_about_round32 = false; - program->next_fp_mode.care_about_round16_64 = false; - program->next_fp_mode.denorm16_64 = fp_denorm_keep; - program->next_fp_mode.denorm32 = 0; - program->next_fp_mode.round32 = fp_round_ne; - program->next_fp_mode.round16_64 = fp_round_ne; ctx.block->fp_mode = program->next_fp_mode; add_startpgm(&ctx);
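
The mubuf_load_callback() hunk above now takes the known alignment into account, not just bytes_needed, when choosing the load width. Below is a rough standalone C++ sketch of that selection; pick_buffer_load() is an invented helper, the returned strings mirror the opcode names in the hunk, and the >= 8-byte cases are assumed to follow the pre-existing dwordx2/x3/x4 ladder that the patch leaves untouched.

#include <cstdio>

/* Sketch: choose a buffer load width from the bytes still needed and the
 * known alignment of the address; unaligned addresses force narrower loads. */
const char *pick_buffer_load(unsigned bytes_needed, unsigned align)
{
    if (bytes_needed == 1 || align % 2)
        return "buffer_load_ubyte";   /* 1 byte  */
    if (bytes_needed == 2 || align % 4)
        return "buffer_load_ushort";  /* 2 bytes */
    if (bytes_needed <= 4)
        return "buffer_load_dword";   /* 4 bytes */
    if (bytes_needed <= 8)
        return "buffer_load_dwordx2"; /* assumed: outside the hunk shown */
    if (bytes_needed <= 12)
        return "buffer_load_dwordx3"; /* assumed: outside the hunk shown */
    return "buffer_load_dwordx4";     /* assumed: outside the hunk shown */
}

int main()
{
    printf("%s\n", pick_buffer_load(4, 2)); /* only 2-byte aligned -> buffer_load_ushort */
    printf("%s\n", pick_buffer_load(4, 4)); /* dword aligned       -> buffer_load_dword  */
    return 0;
}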
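
The split_buffer_store() hunk above now derives alignment from nir_intrinsic_align_offset() plus the running byte offset and caps unaligned stores to 2 or 1 bytes instead of always 2. A minimal sketch of that capping rule, with clamp_store_bytes() as an invented stand-in for the in-tree logic:

#include <algorithm>
#include <cstdio>

/* Sketch: dword or larger stores have to be dword-aligned; otherwise fall
 * back to a 2-byte store when 2-byte alignment is provable, else 1 byte.
 * align_offset here already includes the offset of the current split. */
unsigned clamp_store_bytes(unsigned align_mul, unsigned align_offset, unsigned bytes)
{
    bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
    if (!dword_aligned)
        bytes = std::min(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2u : 1u);
    return bytes;
}

int main()
{
    printf("%u\n", clamp_store_bytes(4, 2, 4)); /* 2-byte aligned: capped to 2 */
    printf("%u\n", clamp_store_bytes(1, 0, 4)); /* unknown alignment: capped to 1 */
    return 0;
}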
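
The visit_load_input() hunk above switches on fetch_bytes rather than the component count so that 16-bit vertex attributes can use the d16 typed-buffer loads. A compact sketch of that mapping; select_fetch_opcode() is an invented wrapper and the strings mirror the opcodes used in the hunk.

#include <cassert>
#include <cstdio>

/* Sketch: map the byte size of one vertex fetch to a load opcode.  16-bit
 * attributes use the d16 tbuffer formats, 32-bit attributes the plain forms
 * (MUBUF dword loads when the data format allows it). */
const char *select_fetch_opcode(unsigned fetch_component, unsigned bitsize, bool use_mubuf)
{
    unsigned fetch_bytes = fetch_component * bitsize / 8;
    switch (fetch_bytes) {
    case 2:  assert(!use_mubuf && bitsize == 16); return "tbuffer_load_format_d16_x";
    case 4:  return bitsize == 16 ? "tbuffer_load_format_d16_xy"
                                  : use_mubuf ? "buffer_load_dword" : "tbuffer_load_format_x";
    case 6:  assert(!use_mubuf && bitsize == 16); return "tbuffer_load_format_d16_xyz";
    case 8:  return bitsize == 16 ? "tbuffer_load_format_d16_xyzw"
                                  : use_mubuf ? "buffer_load_dwordx2" : "tbuffer_load_format_xy";
    case 12: return use_mubuf ? "buffer_load_dwordx3" : "tbuffer_load_format_xyz";
    case 16: return use_mubuf ? "buffer_load_dwordx4" : "tbuffer_load_format_xyzw";
    default: assert(false); return "";
    }
}

int main()
{
    printf("%s\n", select_fetch_opcode(3, 16, false)); /* 6 bytes  -> tbuffer_load_format_d16_xyz */
    printf("%s\n", select_fetch_opcode(4, 32, true));  /* 16 bytes -> buffer_load_dwordx4 */
    return 0;
}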
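
The CASEI/CASEF macros in the subgroup-reduce hunk above pick the ReduceOp variant from the source bit size, with 8-bit variants existing only for the integer ops. A tiny sketch of that selection; reduce_op_name() is invented, and the real code resolves enum values rather than strings.

#include <cstdio>
#include <string>

/* Sketch of the suffix selection done by CASEI/CASEF: 32- and 16-bit sizes
 * map directly, 8-bit only exists for integer ops, everything else falls
 * back to the 64-bit variant. */
std::string reduce_op_name(const char *op, unsigned bit_size, bool is_float)
{
    unsigned size = bit_size == 32 ? 32 :
                    bit_size == 16 ? 16 :
                    (!is_float && bit_size == 8) ? 8 : 64;
    return std::string(op) + std::to_string(size);
}

int main()
{
    printf("%s\n", reduce_op_name("iadd", 8, false).c_str());  /* iadd8  */
    printf("%s\n", reduce_op_name("fmax", 16, true).c_str());  /* fmax16 */
    return 0;
}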
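
get_phi_operand() above now materialises constant booleans that feed logical phis as whole-wave lane-mask immediates instead of plain temporaries. A one-function sketch of that mapping (bool_to_lane_mask() is an invented name):

#include <cstdint>
#include <cstdio>

/* Sketch: a constant boolean used by a logical phi becomes a lane mask that
 * is either all ones (every lane true) or zero, sized to the wave. */
uint64_t bool_to_lane_mask(bool value, unsigned wave_size)
{
    if (!value)
        return 0;
    return wave_size == 64 ? UINT64_MAX : UINT32_MAX;
}

int main()
{
    printf("0x%llx\n", (unsigned long long)bool_to_lane_mask(true, 64)); /* 0xffffffffffffffff */
    printf("0x%llx\n", (unsigned long long)bool_to_lane_mask(true, 32)); /* 0xffffffff */
    return 0;
}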
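
The export_fs_mrt_color() hunk above adds an opt-in fix-up that zeroes NaN outputs before export, using v_cmp_class_f32 with class mask 3 followed by v_cndmask_b32 (mask 3 should select both the signaling and quiet NaN classes). Per lane this is equivalent to the scalar sketch below.

#include <cmath>
#include <cstdio>

/* Per-lane equivalent of the MRT NaN fix-up: lanes whose value is any kind
 * of NaN are replaced by 0.0 before the export. */
float mrt_nan_fixup(float v)
{
    return std::isnan(v) ? 0.0f : v;
}

int main()
{
    printf("%f\n", mrt_nan_fixup(1.5f));          /* 1.500000 */
    printf("%f\n", mrt_nan_fixup(std::nanf(""))); /* 0.000000 */
    return 0;
}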