X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_instruction_selection.cpp;h=5bd389a5be415be281b28b922f7c4c89a73de7c7;hb=9c1e0d86a813af7609acf42cfe6bec7401d6405f;hp=c94a1b00e5ebb5134bc3c45a8d9fe95ca119e9de;hpb=b36950ad2c044967ea1b53917c0b068637492f77;p=mesa.git diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index c94a1b00e5e..5bd389a5be4 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -38,6 +38,23 @@ namespace aco { namespace { +#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__) + +static void _isel_err(isel_context *ctx, const char *file, unsigned line, + const nir_instr *instr, const char *msg) +{ + char *out; + size_t outsize; + FILE *memf = open_memstream(&out, &outsize); + + fprintf(memf, "%s: ", msg); + nir_print_instr(instr, memf); + fclose(memf); + + _aco_err(ctx->program, file, line, out); + free(out); +} + class loop_info_RAII { isel_context* ctx; unsigned header_idx_old; @@ -472,7 +489,7 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un offset = Operand(0u); } - unsigned num_components = dst.bytes() / component_size; + unsigned num_components = vec.bytes() / component_size; if (vec.regClass() == dst.regClass()) { assert(offset.constantValue() == 0); bld.copy(Definition(dst), vec); @@ -480,17 +497,18 @@ void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, un return; } - emit_split_vector(ctx, vec, vec.bytes() / component_size); + emit_split_vector(ctx, vec, num_components); std::array elems; RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); assert(offset.constantValue() % component_size == 0); unsigned skip = offset.constantValue() / component_size; - for (unsigned i = 0; i < num_components; i++) - elems[i] = emit_extract_vector(ctx, vec, i + skip, rc); + for (unsigned i = skip; i < num_components; i++) + elems[i - skip] = emit_extract_vector(ctx, vec, i, rc); /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ if (dst.type() == RegType::vgpr) { + num_components = dst.bytes() / component_size; aco_ptr create_vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; for (unsigned i = 0; i < num_components; i++) create_vec->operands[i] = Operand(elems[i]); @@ -536,6 +554,104 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1 return emit_wqm(ctx, tmp, dst); } +Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) +{ + if (!dst.id()) { + if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) + dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u)); + else + dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword()); + } + + if (dst.bytes() == src.bytes() && dst_bits < src_bits) + return bld.copy(Definition(dst), src); + else if (dst.bytes() < src.bytes()) + return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); + + Temp tmp = dst; + if (dst_bits == 64) + tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1); + + if (tmp == src) { + } else if (src.regClass() == s1) { + if (is_signed) + bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src); + else + bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 
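
The SGPR path of convert_int() here selects s_sext_i32_i8/s_sext_i32_i16 for signed widening and an s_and_b32 mask for unsigned widening. A minimal scalar model of that choice (helper name is mine, for illustration only):

   #include <stdint.h>
   static inline uint32_t widen_to_32(uint32_t v, unsigned src_bits, bool is_signed)
   {
      uint32_t mask = src_bits == 8 ? 0xFFu : 0xFFFFu;
      v &= mask;                                   /* the s_and_b32 case */
      if (is_signed && (v & (mask ^ (mask >> 1)))) /* sign bit set? */
         v |= ~mask;                               /* the s_sext case */
      return v;
   }
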
0xFFu : 0xFFFFu), src); + } else if (ctx->options->chip_class >= GFX8) { + assert(src_bits != 8 || src.regClass() == v1b); + assert(src_bits != 16 || src.regClass() == v2b); + aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; + sdwa->operands[0] = Operand(src); + sdwa->definitions[0] = Definition(tmp); + if (is_signed) + sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword; + else + sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword; + sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword; + bld.insert(std::move(sdwa)); + } else { + assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7); + aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32; + bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u)); + } + + if (dst_bits == 64) { + if (is_signed && dst.regClass() == s2) { + Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); + } else if (is_signed && dst.regClass() == v2) { + Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); + } else { + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u)); + } + } + + return dst; +} + +enum sgpr_extract_mode { + sgpr_extract_sext, + sgpr_extract_zext, + sgpr_extract_undef, +}; + +Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode) +{ + Temp vec = get_ssa_temp(ctx, src->src.ssa); + unsigned src_size = src->src.ssa->bit_size; + unsigned swizzle = src->swizzle[0]; + + if (vec.size() > 1) { + assert(src_size == 16); + vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); + swizzle = swizzle & 1; + } + + Builder bld(ctx->program, ctx->block); + unsigned offset = src_size * swizzle; + Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst; + + if (mode == sgpr_extract_undef && swizzle == 0) { + bld.copy(Definition(tmp), vec); + } else if (mode == sgpr_extract_undef || (offset == 24 && mode == sgpr_extract_zext)) { + bld.sop2(aco_opcode::s_lshr_b32, Definition(tmp), bld.def(s1, scc), vec, Operand(offset)); + } else if (src_size == 8 && swizzle == 0 && mode == sgpr_extract_sext) { + bld.sop1(aco_opcode::s_sext_i32_i8, Definition(tmp), vec); + } else if (src_size == 16 && swizzle == 0 && mode == sgpr_extract_sext) { + bld.sop1(aco_opcode::s_sext_i32_i16, Definition(tmp), vec); + } else { + aco_opcode op = mode == sgpr_extract_zext ? 
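
For the s_bfe_u32/s_bfe_i32 fallback selected just below, the field is described by a single packed operand, hence Operand((src_size << 16) | offset): width in the upper half-word, bit offset in the lower. A scalar model of the unsigned case (illustrative; field widths per my reading of the ISA docs):

   #include <stdint.h>
   static inline uint32_t s_bfe_u32_model(uint32_t src, uint32_t packed)
   {
      unsigned offset = packed & 0x1fu;         /* 5 bits for the 32-bit op */
      unsigned width  = (packed >> 16) & 0x7fu;
      if (width == 0)
         return 0;                              /* empty field */
      if (width >= 32)
         return src >> offset;
      return (src >> offset) & ((1u << width) - 1u);
   }
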
aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; + bld.sop2(op, Definition(tmp), bld.def(s1, scc), vec, Operand((src_size << 16) | offset)); + } + + if (dst.regClass() == s2) + convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst); + + return dst; +} + Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) { if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1) @@ -559,23 +675,8 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1) if (elem_size < 4 && vec.type() == RegType::sgpr) { assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); assert(size == 1); - unsigned swizzle = src.swizzle[0]; - if (vec.size() > 1) { - assert(src.src.ssa->bit_size == 16); - vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); - swizzle = swizzle & 1; - } - if (swizzle == 0) - return vec; - - Temp dst{ctx->program->allocateId(), s1}; - aco_ptr bfe{create_instruction(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)}; - bfe->operands[0] = Operand(vec); - bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle))); - bfe->definitions[0] = Definition(dst); - bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); - ctx->block->instructions.emplace_back(std::move(bfe)); - return dst; + return extract_8_16_bit_sgpr_element( + ctx, Temp(ctx->program->allocateId(), s1), &src, sgpr_extract_undef); } RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4); @@ -614,6 +715,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); sop2->definitions[0] = Definition(dst); + if (instr->no_unsigned_wrap) + sop2->definitions[0].setNUW(true); if (writes_scc) sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); ctx->block->instructions.emplace_back(std::move(sop2)); @@ -865,9 +968,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } return; } @@ -885,9 +986,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) aco_opcode op = dst.regClass() == s1 ? 
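
The uniform bcsel path first moves the boolean into SCC (bool_to_scalar_condition) because the s_cselect_* opcodes chosen just below read their condition from SCC rather than from a source operand. Semantically (sketch, helper name is mine):

   #include <stdint.h>
   static inline uint32_t s_cselect_b32_model(bool scc, uint32_t src0, uint32_t src1)
   {
      return scc ? src0 : src1;
   }
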
aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); } else { - fprintf(stderr, "Unimplemented uniform bcsel bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented uniform bcsel bit size"); } return; } @@ -1039,68 +1138,10 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) return add->definitions[0].getTemp(); } -Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) { - if (!dst.id()) { - if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) - dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u)); - else - dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword()); - } - - if (dst.bytes() == src.bytes() && dst_bits < src_bits) - return bld.copy(Definition(dst), src); - else if (dst.bytes() < src.bytes()) - return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); - - Temp tmp = dst; - if (dst_bits == 64) - tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1); - - if (tmp == src) { - } else if (src.regClass() == s1) { - if (is_signed) - bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src); - else - bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src); - } else if (ctx->options->chip_class >= GFX8) { - assert(src_bits != 8 || src.regClass() == v1b); - assert(src_bits != 16 || src.regClass() == v2b); - aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; - sdwa->operands[0] = Operand(src); - sdwa->definitions[0] = Definition(tmp); - if (is_signed) - sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword; - else - sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword; - sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword; - bld.insert(std::move(sdwa)); - } else { - assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7); - aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32; - bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u)); - } - - if (dst_bits == 64) { - if (is_signed && dst.regClass() == s2) { - Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u)); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); - } else if (is_signed && dst.regClass() == v2) { - Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); - } else { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u)); - } - } - - return dst; -} - void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) { if (!instr->dest.dest.is_ssa) { - fprintf(stderr, "nir alu dst not in ssa: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "nir alu dst not in ssa"); abort(); } Builder bld(ctx->program, ctx->block); @@ -1194,9 +1235,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) aco_opcode opcode = dst.size() == 1 ? 
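
All of the fprintf/nir_print_instr triples in this function collapse into the isel_err() helper added at the top of the patch. That helper relies on POSIX open_memstream(), whose lifetime rules are easy to get wrong; the pattern in isolation (hypothetical standalone function, assuming only POSIX stdio):

   #include <stdio.h>
   #include <stdlib.h>
   static char *format_msg(const char *msg)
   {
      char *out;
      size_t outsize;
      FILE *f = open_memstream(&out, &outsize);
      fprintf(f, "%s: ...", msg); /* isel_err() also prints the NIR instr here */
      fclose(f);                  /* finalizes the buffer; 'out' is valid only now */
      return out;                 /* caller must free(), as _isel_err() does */
   }
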
aco_opcode::s_not_b32 : aco_opcode::s_not_b64; bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1223,9 +1262,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1236,9 +1273,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src)); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1266,9 +1301,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1278,9 +1311,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1290,9 +1321,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1302,9 +1331,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1314,9 +1341,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1332,9 +1357,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s2) { emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1350,9 +1373,7 @@ void 
visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s2) { emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1368,9 +1389,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s2) { emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1388,9 +1407,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1408,9 +1425,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s2) { emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1428,9 +1443,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s2) { emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1443,9 +1456,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (src.regClass() == s2) { bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1472,9 +1483,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp(); bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1484,9 +1493,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1522,9 +1529,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); 
- fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1553,9 +1558,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry); } } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1589,9 +1592,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u)); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1625,9 +1626,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1660,9 +1659,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u)); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1673,9 +1670,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1689,9 +1684,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1705,9 +1698,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1721,9 +1712,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1737,9 +1726,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { 
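
The 64-bit integer adds and subtracts in this section are assembled from two 32-bit halves chained through a carry/borrow bit (vadd32/vsub32 with the carry-out of the low half feeding the high half). A standalone model of the add (helper name is mine):

   #include <stdint.h>
   static inline void add64(uint32_t a_lo, uint32_t a_hi,
                            uint32_t b_lo, uint32_t b_hi,
                            uint32_t *lo, uint32_t *hi)
   {
      *lo = a_lo + b_lo;
      uint32_t carry = *lo < a_lo; /* unsigned wrap signals carry-out */
      *hi = a_hi + b_hi + carry;
   }
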
bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1762,9 +1749,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) VOP3A_instruction* sub = static_cast(add); sub->neg[1] = true; } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1784,9 +1769,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1); } } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1806,9 +1789,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1); } } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1818,9 +1799,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1830,9 +1809,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1842,9 +1819,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1852,9 +1827,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.size() == 1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1862,9 +1835,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.size() == 1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1872,9 +1843,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.size() == 1) { emit_vop3a_instruction(ctx, instr, 
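
NIR's fmin3/fmax3/fmed3 and the integer variants in this stretch map directly onto the VOP3 v_*3 opcodes; v_med3 yields the middle of its three operands. Integer model (floats add NaN handling on top; helper name is mine):

   #include <stdint.h>
   static inline uint32_t med3_u32(uint32_t a, uint32_t b, uint32_t c)
   {
      uint32_t mn = a < b ? a : b;
      uint32_t mx = a < b ? b : a;
      return c <= mn ? mn : (c >= mx ? mx : c);
   }
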
aco_opcode::v_med3_u32, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1882,9 +1851,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.size() == 1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1892,9 +1859,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.size() == 1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1902,9 +1867,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.size() == 1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1917,8 +1880,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma); Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); - sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/)); - tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/)); + sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma), Operand(0x3f000000u/*0.5*/)); + tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), + bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma), Operand(0x3f000000u/*0.5*/)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc); break; } @@ -1944,9 +1909,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) /* Lowered at NIR level for precision reasons. 
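
On the cube-coordinate change above: the fused v_madak_f32 is replaced by an explicit v_mul_f32 + v_add_f32 pair computing the same sc*ma + 0.5. As I read the ISA, v_cubema_f32 returns twice the major axis, and ma here is its reciprocal, so the result is the usual face coordinate sc/(2*|major|) + 0.5:

   /* sketch, under the 2*major-axis assumption stated above */
   static inline float face_coord(float sc, float two_major_axis)
   {
      return sc * (1.0f / two_major_axis) + 0.5f;
   }
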
*/ emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1968,9 +1931,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -1992,9 +1953,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2011,9 +1970,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) VOP3A_instruction* vop3 = static_cast(add); vop3->clamp = true; } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2024,9 +1981,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_log2(ctx, bld, Definition(dst), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2040,9 +1995,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) /* Lowered at NIR level for precision reasons. */ emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2052,9 +2005,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2068,9 +2019,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) /* Lowered at NIR level for precision reasons. 
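
The 64-bit fneg/fabs cases above only touch the high dword, since that is where an IEEE double keeps its sign bit (v_xor_b32 with 0x80000000 to negate, v_and_b32 with 0x7FFFFFFF for the absolute value). Bit-level model:

   #include <stdint.h>
   static inline uint64_t fneg64_bits(uint64_t x) { return x ^ 0x8000000000000000ull; }
   static inline uint64_t fabs64_bits(uint64_t x) { return x & 0x7FFFFFFFFFFFFFFFull; }
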
*/ emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2082,9 +2031,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2097,9 +2044,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { emit_floor_f64(ctx, bld, Definition(dst), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2127,9 +2072,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add); } } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2142,9 +2085,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { emit_trunc_f64(ctx, bld, Definition(dst), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2182,9 +2123,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); } } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2208,9 +2147,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) aco_opcode opcode = instr->op == nir_op_fsin ? 
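
In the fsin/fcos case selected here, the tmp operand used just below is presumably the angle already scaled by 1/(2*pi) — that multiply sits in context the diff does not show — since v_sin_f32/v_cos_f32 take their input in revolutions, not radians. Sketch of the scaling, under that assumption:

   static inline float to_revolutions(float radians)
   {
      return radians * 0.15915494f; /* 1/(2*pi), 0x3e22f983 */
   }
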
aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; bld.vop1(opcode, Definition(dst), tmp); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2224,9 +2161,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2239,9 +2174,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v2) { bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2256,9 +2189,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (instr->src[0].src.ssa->bit_size == 64) { bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2287,9 +2218,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2320,9 +2249,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (instr->src[0].src.ssa->bit_size == 64) { emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2368,9 +2295,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2412,9 +2337,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u)); bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2453,9 +2376,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (instr->src[0].src.ssa->bit_size == 64) { emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2474,9 +2395,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) 
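
The v_frexp_mant_* / v_frexp_exp_* cases above decompose x == mant * 2^exp with |mant| in [0.5, 1), i.e. the same contract as C's frexp(). Reference behavior for finite, non-zero inputs:

   #include <math.h>
   static void frexp_ref(double x, double *mant, int *exp)
   {
      *mant = frexp(x, exp); /* x == *mant * 2^*exp, 0.5 <= |*mant| < 1 */
   }
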
} else if (instr->src[0].src.ssa->bit_size == 64) { emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2553,9 +2472,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2625,9 +2542,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2680,31 +2595,52 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: { - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), - instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst); + if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { + /* no need to do the extract in get_alu_src() */ + sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ? + sgpr_extract_sext : sgpr_extract_undef; + extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); + } else { + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), + instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst); + } break; } case nir_op_u2u8: case nir_op_u2u16: case nir_op_u2u32: case nir_op_u2u64: { - convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), - instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst); + if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { + /* no need to do the extract in get_alu_src() */ + sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ? + sgpr_extract_zext : sgpr_extract_undef; + extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); + } else { + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), + instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst); + } break; } case nir_op_b2b32: - case nir_op_b2i32: { + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: { Temp src = get_alu_src(ctx, instr->src[0]); assert(src.regClass() == bld.lm); - if (dst.regClass() == s1) { + Temp tmp = dst.bytes() == 8 ? 
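
The i2i*/u2u* cases above now route sub-dword SGPR sources straight through extract_8_16_bit_sgpr_element() instead of extracting and then converting separately. Restating the mode choice standalone (enum and helper names are mine):

   enum sgpr_mode { SEXT, ZEXT, UNDEF_HI };
   static enum sgpr_mode pick_mode(unsigned dst_bits, unsigned src_bits, bool is_signed)
   {
      if (dst_bits <= src_bits)
         return UNDEF_HI;              /* upper bits are never read */
      return is_signed ? SEXT : ZEXT;  /* widening must define them */
   }
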
bld.tmp(RegClass::get(dst.type(), 4)) : dst; + if (tmp.regClass() == s1) { // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ - bool_to_scalar_condition(ctx, src, dst); - } else if (dst.regClass() == v1) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src); + bool_to_scalar_condition(ctx, src, tmp); + } else if (tmp.type() == RegType::vgpr) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src); } else { unreachable("Invalid register class for b2i32"); } + + if (tmp != dst) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u)); break; } case nir_op_b2b1: @@ -2778,16 +2714,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = bld.tmp(v1); Temp src1 = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src); - if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz) + if (0 && (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)) { bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1); - else - bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst), - bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0), - bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1)); + } else { + src0 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0); + src1 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1); + if (ctx->program->chip_class >= GFX10) { + /* the high bits of v_cvt_f16_f32 isn't zero'd on GFX10 */ + bld.vop3(aco_opcode::v_pack_b32_f16, Definition(dst), src0, src1); + } else { + bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst), src0, src1); + } + } } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2795,9 +2735,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == v1) { bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2807,9 +2745,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0])))); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2850,9 +2786,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -2895,9 +2829,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, 
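
The f16-pack path above leans on v_cvt_f16_f32 zeroing the high 16 bits of its destination, which lets pre-GFX10 parts combine two halves with v_cvt_pk_u16_u32; GFX10 no longer zeroes them (per the comment in the hunk), hence v_pack_b32_f16 there. Note also that the v_cvt_pkrtz fast path is disabled outright in this tree ("if (0 && ...)"), so every pack goes through the explicit conversions. The packing itself is just:

   #include <stdint.h>
   static inline uint32_t pack_2xf16(uint16_t lo, uint16_t hi)
   {
      return (uint32_t)lo | ((uint32_t)hi << 16);
   }
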
"Unimplemented NIR instr bit size"); } break; } @@ -2970,9 +2902,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (src.regClass() == s2) { bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src); } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); } break; } @@ -3059,9 +2989,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } default: - fprintf(stderr, "Unknown NIR ALU instr: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); + isel_err(&instr->instr, "Unknown NIR ALU instr"); } } @@ -3127,8 +3055,7 @@ struct LoadEmitInfo { bool glc = false; unsigned swizzle_component_size = 0; - barrier_interaction barrier = barrier_none; - bool can_reorder = true; + memory_sync_info sync; Temp soffset = Temp(0, s1); }; @@ -3222,7 +3149,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) /* align offset down if needed */ Operand aligned_offset = offset; + unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; if (need_to_align_offset) { + align = 4; Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp(); if (offset.isConstant()) { aligned_offset = Operand(offset.constantValue() & 0xfffffffcu); @@ -3242,7 +3171,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset); - unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align, reduced_const_offset, byte_align ? Temp() : info->dst); @@ -3304,7 +3232,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) if (num_tmps > 1) { aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)}; - for (unsigned i = 0; i < num_vals; i++) + for (unsigned i = 0; i < num_tmps; i++) vec->operands[i] = Operand(tmp[i]); tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size)); vec->definitions[0] = Definition(tmp[0]); @@ -3430,10 +3358,12 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info, RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4)); Temp val = rc == info->dst.regClass() && dst_hint.id() ? 
dst_hint : bld.tmp(rc); + Instruction *instr; if (read2) - bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); + instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); else - bld.ds(op, Definition(val), offset, m, const_offset); + instr = bld.ds(op, Definition(val), offset, m, const_offset); + static_cast(instr)->sync = info->sync; if (size < 4) val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u)); @@ -3479,8 +3409,7 @@ Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info, load->definitions[0] = Definition(val); load->glc = info->glc; load->dlc = info->glc && bld.program->chip_class >= GFX10; - load->barrier = info->barrier; - load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works + load->sync = info->sync; bld.insert(std::move(load)); return val; } @@ -3503,10 +3432,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, unsigned bytes_size = 0; aco_opcode op; - if (bytes_needed == 1) { + if (bytes_needed == 1 || align_ % 2) { bytes_size = 1; op = aco_opcode::buffer_load_ubyte; - } else if (bytes_needed == 2) { + } else if (bytes_needed == 2 || align_ % 4) { bytes_size = 2; op = aco_opcode::buffer_load_ushort; } else if (bytes_needed <= 4) { @@ -3529,10 +3458,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = info->glc; mubuf->dlc = info->glc && bld.program->chip_class >= GFX10; - mubuf->barrier = info->barrier; - mubuf->can_reorder = info->can_reorder; + mubuf->sync = info->sync; mubuf->offset = const_offset; - RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); + mubuf->swizzled = info->swizzle_component_size != 0; + RegClass rc = RegClass::get(RegType::vgpr, bytes_size); Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
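
mubuf_load_callback() now also falls back to byte/short loads when the known alignment is sub-dword, not just when few bytes are needed. The selection rule, standalone (the real code additionally keys dwordx3 availability on chip generation):

   static unsigned mubuf_load_width(unsigned bytes_needed, unsigned align)
   {
      if (bytes_needed == 1 || (align % 2)) return 1;  /* buffer_load_ubyte   */
      if (bytes_needed == 2 || (align % 4)) return 2;  /* buffer_load_ushort  */
      if (bytes_needed <= 4)  return 4;                /* buffer_load_dword   */
      if (bytes_needed <= 8)  return 8;                /* buffer_load_dwordx2 */
      if (bytes_needed <= 12) return 12;               /* buffer_load_dwordx3 */
      return 16;                                       /* buffer_load_dwordx4 */
   }
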
dst_hint : bld.tmp(rc); mubuf->definitions[0] = Definition(val); bld.insert(std::move(mubuf)); @@ -3541,6 +3470,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, } static auto emit_mubuf_load = emit_load; +static auto emit_scratch_load = emit_load; Temp get_gfx6_global_rsrc(Builder& bld, Temp addr) { @@ -3592,7 +3522,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->offset = 0; mubuf->addr64 = offset.type() == RegType::vgpr; mubuf->disable_wqm = false; - mubuf->barrier = info->barrier; + mubuf->sync = info->sync; mubuf->definitions[0] = Definition(val); bld.insert(std::move(mubuf)); } else { @@ -3603,7 +3533,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, flat->operands[1] = Operand(s1); flat->glc = info->glc; flat->dlc = info->glc && bld.program->chip_class >= GFX10; - flat->barrier = info->barrier; + flat->sync = info->sync; flat->offset = 0u; flat->definitions[0] = Definition(val); bld.insert(std::move(flat)); @@ -3625,8 +3555,7 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst, LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes}; info.align_mul = align; info.align_offset = 0; - info.barrier = barrier_shared; - info.can_reorder = false; + info.sync = memory_sync_info(storage_shared); info.const_offset = base_offset; emit_lds_load(ctx, bld, &info); @@ -3665,13 +3594,15 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp /* use allocated_vec if possible */ auto it = ctx->allocated_vec.find(src.id()); if (it != ctx->allocated_vec.end()) { - unsigned total_size = 0; - for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++) - total_size += it->second[i].bytes(); - if (total_size != src.bytes()) + if (!it->second[0].id()) goto split; - unsigned elem_size = it->second[0].bytes(); + assert(src.bytes() % elem_size == 0); + + for (unsigned i = 0; i < src.bytes() / elem_size; i++) { + if (!it->second[i].id()) + goto split; + } for (unsigned i = 0; i < count; i++) { if (offsets[i] % elem_size || dst[i].bytes() % elem_size) @@ -3703,10 +3634,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp } } + split: + if (dst_type == RegType::sgpr) src = bld.as_uniform(src); - split: /* just split it */ aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)}; split->operands[0] = Operand(src); @@ -3832,13 +3764,16 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t } assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */ + Instruction *instr; if (write2) { Temp second_data = write_datas[second]; inline_offset /= data.bytes(); - bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); + instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); } else { - bld.ds(op, address_offset, data, m, inline_offset); + instr = bld.ds(op, address_offset, data, m, inline_offset); } + static_cast(instr)->sync = + memory_sync_info(storage_shared); } } @@ -3983,7 +3918,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un } void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, - unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false) + unsigned const_offset = 0u, memory_sync_info sync=memory_sync_info(), + bool slc = false, bool 
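
A subtle fix hiding in split_store_data() above: the split: label moved in front of the as_uniform() call, so the goto split paths (taken when the cached allocated_vec elements cannot be reused) no longer skip the VGPR-to-SGPR conversion for SGPR destinations. Schematically:

   /* before the patch: goto split jumped past the conversion
    *    if (sgpr) src = as_uniform(src);
    * split:
    *    ...
    * after: the label precedes the conversion, so every path converts
    * split:
    *    if (sgpr) src = as_uniform(src);
    */
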
swizzled = false) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); @@ -3996,15 +3932,16 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u); Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, - /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc); + /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, + /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, + /* dlc*/ false, /* slc */ slc); - static_cast(r.instr)->can_reorder = allow_reorder; + static_cast(r.instr)->sync = sync; } void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask, - bool allow_combining = true, bool reorder = true, bool slc = false) + bool allow_combining = true, memory_sync_info sync=memory_sync_info(), bool slc = false) { Builder bld(ctx->program, ctx->block); assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); @@ -4019,7 +3956,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, slc, !allow_combining); } } @@ -4339,7 +4276,7 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. 
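
Throughout this patch the old per-instruction can_reorder/barrier fields give way to a single memory_sync_info carrying a storage class and semantic flags. Reconstructed shape, inferred only from the uses visible here (the real definition lives in ACO's IR headers):

   #include <stdint.h>
   struct memory_sync_info {
      uint16_t storage;   /* e.g. storage_shared, storage_image, storage_vmem_output */
      uint16_t semantics; /* e.g. semantic_volatile, semantic_can_reorder,
                             semantic_private, semantic_atomicrmw */
   };
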
*/ Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u)); Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset); - store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true); + store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, memory_sync_info(), true); } else { Temp lds_base; @@ -4424,7 +4361,7 @@ void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u)); Temp oc_lds = get_arg(ctx, ctx->args->oc_lds); - store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false); + store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, memory_sync_info(storage_vmem_output)); } if (write_to_lds) { @@ -4459,9 +4396,7 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr) ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { bool stored_to_temps = store_output_to_temps(ctx, instr); if (!stored_to_temps) { - fprintf(stderr, "Unimplemented output offset instruction:\n"); - nir_print_instr(instr->src[1].ssa->parent_instr, stderr); - fprintf(stderr, "\n"); + isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction"); abort(); } } else if (ctx->stage == vertex_es || @@ -4678,9 +4613,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) nir_instr *off_instr = instr->src[0].ssa->parent_instr; if (off_instr->type != nir_instr_type_load_const) { - fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); - nir_print_instr(off_instr, stderr); - fprintf(stderr, "\n"); + isel_err(off_instr, "Unimplemented nir_intrinsic_load_input offset"); } uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32; @@ -4829,15 +4762,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } if (use_mubuf) { - Instruction *mubuf = bld.mubuf(opcode, - Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, true).instr; - static_cast(mubuf)->can_reorder = true; + bld.mubuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_offset, false, false, true).instr; } else { - Instruction *mtbuf = bld.mtbuf(opcode, - Definition(fetch_dst), list, fetch_index, soffset, - fetch_dfmt, nfmt, fetch_offset, false, true).instr; - static_cast(mtbuf)->can_reorder = true; + bld.mtbuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_dfmt, nfmt, fetch_offset, false, true).instr; } emit_split_vector(ctx, fetch_dst, fetch_dst.size()); @@ -4894,9 +4825,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr; if (off_instr->type != nir_instr_type_load_const || nir_instr_as_load_const(off_instr)->value[0].u32 != 0) { - fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n"); - nir_print_instr(off_instr, stderr); - fprintf(stderr, "\n"); + isel_err(off_instr, "Unimplemented nir_intrinsic_load_input offset"); } Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); @@ -5190,7 +5119,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) void load_buffer(isel_context 
                  Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
-                 bool glc=false, bool readonly=true, bool allow_smem=true)
+                 bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
 {
    Builder bld(ctx->program, ctx->block);
@@ -5200,8 +5129,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_
 
    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
    info.glc = glc;
-   info.barrier = readonly ? barrier_none : barrier_buffer;
-   info.can_reorder = readonly;
+   info.sync = sync;
    info.align_mul = align_mul;
    info.align_offset = align_offset;
    if (use_smem)
@@ -5277,7 +5205,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
    if (offset != 0) // TODO check if index != 0 as well
-      index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+      index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
 
    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
    Temp vec = dst;
    bool trim = false;
@@ -5319,7 +5247,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
       unreachable("unimplemented or forbidden load_push_constant.");
    }
 
-   bld.smem(op, Definition(vec), ptr, index);
+   static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
 
    if (!aligned) {
       Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
@@ -5363,7 +5291,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
 
    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
    if (base && offset.type() == RegType::sgpr)
-      offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+      offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
    else if (base && offset.type() == RegType::vgpr)
       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
@@ -5719,7 +5647,6 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec
    load->unrm = true;
    load->da = da;
    load->dim = dim;
-   load->can_reorder = true;  /* fmask images shouldn't be modified */
    ctx->block->instructions.emplace_back(std::move(load));
 
    Operand sample_index4;
@@ -5819,6 +5746,22 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr
 }
 
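+/* Build the memory_sync_info for a memory intrinsic: atomics keep the
+ * semantics passed in, other accesses translate the NIR volatile and
+ * can-reorder access flags. */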
+memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
+{
+   /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
+   if (semantics & semantic_atomicrmw)
+      return memory_sync_info(storage, semantics);
+
+   unsigned access = nir_intrinsic_access(instr);
+
+   if (access & ACCESS_VOLATILE)
+      semantics |= semantic_volatile;
+   if (access & ACCESS_CAN_REORDER)
+      semantics |= semantic_can_reorder | semantic_private;
+
+   return memory_sync_info(storage, semantics);
+}
+
 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
 {
    Builder bld(ctx->program, ctx->block);
@@ -5828,6 +5771,9 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
    bool is_array = glsl_sampler_type_is_array(type);
    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
 
+   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+   unsigned access = var->data.access | nir_intrinsic_access(instr);
+
    if (dim == GLSL_SAMPLER_DIM_BUF) {
       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
       unsigned num_channels = util_last_bit(mask);
@@ -5862,9 +5808,9 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
       tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
       load->definitions[0] = Definition(tmp);
       load->idxen = true;
-      load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
+      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
-      load->barrier = barrier_image;
+      load->sync = sync;
       ctx->block->instructions.emplace_back(std::move(load));
 
       expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
@@ -5890,13 +5836,13 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
    load->operands[1] = Operand(s4); /* no sampler */
    load->operands[2] = Operand(coords);
    load->definitions[0] = Definition(tmp);
-   load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
+   load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
    load->dlc = load->glc && ctx->options->chip_class >= GFX10;
    load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
    load->dmask = dmask;
    load->unrm = true;
    load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
-   load->barrier = barrier_image;
+   load->sync = sync;
    ctx->block->instructions.emplace_back(std::move(load));
 
    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
@@ -5911,7 +5857,9 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
    bool is_array = glsl_sampler_type_is_array(type);
    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
 
-   bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
+   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+   unsigned access = var->data.access | nir_intrinsic_access(instr);
+   bool glc = ctx->options->chip_class == GFX6 || access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
 
    if (dim == GLSL_SAMPLER_DIM_BUF) {
       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
@@ -5942,7 +5890,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
       store->glc = glc;
       store->dlc = false;
       store->disable_wqm = true;
-      store->barrier = barrier_image;
+      store->sync = sync;
       ctx->program->needs_exact = true;
       ctx->block->instructions.emplace_back(std::move(store));
       return;
@@ -5966,7 +5914,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
    store->unrm = true;
    store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
    store->disable_wqm = true;
-   store->barrier = barrier_image;
+   store->sync = sync;
    ctx->program->needs_exact = true;
    ctx->block->instructions.emplace_back(std::move(store));
    return;
@@ -6044,6 +5992,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
    }
 
    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
 
    if (dim == GLSL_SAMPLER_DIM_BUF) {
       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
@@ -6061,7 +6010,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
      mubuf->glc = return_previous;
      mubuf->dlc = false; /* Not needed for atomics */
      mubuf->disable_wqm = true;
-      mubuf->barrier = barrier_image;
+      mubuf->sync = sync;
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
      return;
@@ -6082,7 +6031,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
   mimg->unrm = true;
   mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
   mimg->disable_wqm = true;
-   mimg->barrier = barrier_image;
+   mimg->sync = sync;
   ctx->program->needs_exact = true;
   ctx->block->instructions.emplace_back(std::move(mimg));
   return;
@@ -6131,6 +6080,7 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
    }
 
    /* LOD */
+   assert(nir_src_as_uint(instr->src[1]) == 0);
    Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
 
    /* Resource */
@@ -6146,7 +6096,6 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
    mimg->da = glsl_sampler_type_is_array(type);
-   mimg->can_reorder = true;
    Definition& def = mimg->definitions[0];
    ctx->block->instructions.emplace_back(std::move(mimg));
 
@@ -6201,7 +6150,8 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
 
    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
-               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
+               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
+               get_memory_sync_info(instr, storage_buffer, 0));
 }
 
 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -6215,6 +6165,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
 
+   memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
    uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
    /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
@@ -6224,6 +6175,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    bool smem = !nir_src_is_divergent(instr->src[2]) &&
                ctx->options->chip_class >= GFX8 &&
+               ctx->options->chip_class < GFX10_3 &&
                (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
                allow_smem;
    if (smem)
@@ -6245,8 +6197,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
          aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
          store->operands[0] = Operand(rsrc);
          if (offsets[i]) {
-            Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                offset, Operand(offsets[i]));
+            Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+                                      offset, Operand(offsets[i]));
             store->operands[1] = Operand(off);
          } else {
            store->operands[1] = Operand(offset);
         }
@@ -6257,7 +6209,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
          store->glc = glc;
          store->dlc = false;
          store->disable_wqm = true;
-         store->barrier = barrier_buffer;
+         store->sync = sync;
          ctx->block->instructions.emplace_back(std::move(store));
          ctx->program->wb_smem_l1_on_end = true;
          if (op == aco_opcode::p_fs_buffer_store_smem) {
@@ -6275,7 +6227,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
          store->glc = glc;
          store->dlc = false;
          store->disable_wqm = true;
-         store->barrier = barrier_buffer;
+         store->sync = sync;
          ctx->program->needs_exact = true;
          ctx->block->instructions.emplace_back(std::move(store));
       }
@@ -6366,7 +6318,7 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
    mubuf->glc = return_previous;
    mubuf->dlc = false; /* Not needed for atomics */
    mubuf->disable_wqm = true;
-   mubuf->barrier = barrier_buffer;
+   mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
    ctx->program->needs_exact = true;
    ctx->block->instructions.emplace_back(std::move(mubuf));
 }
@@ -6391,8 +6343,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
    info.align_mul = nir_intrinsic_align_mul(instr);
    info.align_offset = nir_intrinsic_align_offset(instr);
-   info.barrier = barrier_buffer;
-   info.can_reorder = false;
+   info.sync = get_memory_sync_info(instr, storage_buffer, 0);
    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
    * it's safe to use SMEM */
    bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
@@ -6412,6 +6363,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
    Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
 
+   memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
 
    if (ctx->options->chip_class >= GFX7)
@@ -6477,7 +6429,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
         flat->dlc = false;
         flat->offset = offset;
         flat->disable_wqm = true;
-         flat->barrier = barrier_buffer;
+         flat->sync = sync;
         ctx->program->needs_exact = true;
         ctx->block->instructions.emplace_back(std::move(flat));
      } else {
@@ -6497,7 +6449,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
         mubuf->offset = offsets[i];
         mubuf->addr64 = addr.type() == RegType::vgpr;
         mubuf->disable_wqm = true;
-         mubuf->barrier = barrier_buffer;
+         mubuf->sync = sync;
         ctx->program->needs_exact = true;
         ctx->block->instructions.emplace_back(std::move(mubuf));
      }
@@ -6590,7 +6542,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
      flat->dlc = false; /* Not needed for atomics */
      flat->offset = 0;
      flat->disable_wqm = true;
-      flat->barrier = barrier_buffer;
+      flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(flat));
   } else {
@@ -6657,33 +6609,57 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
      mubuf->offset = 0;
      mubuf->addr64 = addr.type() == RegType::vgpr;
      mubuf->disable_wqm = true;
-      mubuf->barrier = barrier_buffer;
+      mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
   }
 }
 
-void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
-   Builder bld(ctx->program, ctx->block);
-   switch(instr->intrinsic) {
-      case nir_intrinsic_group_memory_barrier:
-      case nir_intrinsic_memory_barrier:
-         bld.barrier(aco_opcode::p_memory_barrier_common);
-         break;
-      case nir_intrinsic_memory_barrier_buffer:
-         bld.barrier(aco_opcode::p_memory_barrier_buffer);
-         break;
-      case nir_intrinsic_memory_barrier_image:
-         bld.barrier(aco_opcode::p_memory_barrier_image);
-         break;
-      case nir_intrinsic_memory_barrier_tcs_patch:
-      case nir_intrinsic_memory_barrier_shared:
-         bld.barrier(aco_opcode::p_memory_barrier_shared);
-         break;
-      default:
-         unreachable("Unimplemented memory barrier intrinsic");
-         break;
+sync_scope translate_nir_scope(nir_scope scope)
+{
+   switch (scope) {
+   case NIR_SCOPE_NONE:
+   case NIR_SCOPE_INVOCATION:
+      return scope_invocation;
+   case NIR_SCOPE_SUBGROUP:
+      return scope_subgroup;
+   case NIR_SCOPE_WORKGROUP:
+      return scope_workgroup;
+   case NIR_SCOPE_QUEUE_FAMILY:
+      return scope_queuefamily;
+   case NIR_SCOPE_DEVICE:
+      return scope_device;
    }
+   unreachable("invalid scope");
+}
+
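+/* Lower nir_intrinsic_scoped_barrier: collect the storage classes and
+ * acquire/release semantics it covers, then emit a single p_barrier that
+ * carries both the memory scope and the execution scope. */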
+void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
+   Builder bld(ctx->program, ctx->block);
+
+   unsigned semantics = 0;
+   unsigned storage = 0;
+   sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
+   sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
+
+   unsigned nir_storage = nir_intrinsic_memory_modes(instr);
+   if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
+      storage |= storage_buffer | storage_image; //TODO: split this when NIR gets nir_var_mem_image
+   if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && (nir_storage & nir_var_mem_shared))
+      storage |= storage_shared;
+   if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL && (nir_storage & nir_var_shader_out))
+      storage |= storage_shared;
+
+   unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
+   if (nir_semantics & NIR_MEMORY_ACQUIRE)
+      semantics |= semantic_acquire | semantic_release;
+   if (nir_semantics & NIR_MEMORY_RELEASE)
+      semantics |= semantic_acquire | semantic_release;
+
+   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+   bld.barrier(aco_opcode::p_barrier,
+               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
+               exec_scope);
 }
 
 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -6781,6 +6757,12 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
      op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
      num_operands = 4;
      break;
+   case nir_intrinsic_shared_atomic_fadd:
+      op32 = aco_opcode::ds_add_f32;
+      op32_rtn = aco_opcode::ds_add_rtn_f32;
+      op64 = aco_opcode::num_opcodes;
+      op64_rtn = aco_opcode::num_opcodes;
+      break;
   default:
      unreachable("Unhandled shared atomic intrinsic");
   }
@@ -6820,6 +6802,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
      ds->offset0 = offset;
   if (return_previous)
      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
+   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
   ctx->block->instructions.emplace_back(std::move(ds));
 }
@@ -6831,7 +6814,7 @@ Temp get_scratch_resource(isel_context *ctx)
      scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
 
   uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
-                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
 
   if (ctx->program->chip_class >= GFX10) {
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
@@ -6842,9 +6825,9 @@ Temp get_scratch_resource(isel_context *ctx)
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }
 
-   /* older generations need element size = 16 bytes. element size removed in GFX9 */
+   /* older generations need element size = 4 bytes. element size removed in GFX9 */
   if (ctx->program->chip_class <= GFX8)
-      rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
 
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
 }
@@ -6859,10 +6842,10 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
                       instr->dest.ssa.bit_size / 8u, rsrc};
   info.align_mul = nir_intrinsic_align_mul(instr);
   info.align_offset = nir_intrinsic_align_offset(instr);
-   info.swizzle_component_size = 16;
-   info.can_reorder = false;
+   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
+   info.sync = memory_sync_info(storage_scratch, semantic_private);
   info.soffset = ctx->program->scratch_offset;
-   emit_mubuf_load(ctx, bld, &info);
+   emit_scratch_load(ctx, bld, &info);
 }
 
 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
@@ -6877,12 +6860,14 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
   unsigned write_count = 0;
   Temp write_datas[32];
   unsigned offsets[32];
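+   /* Stores are split at the swizzle element size: 4 bytes on GFX8 and
+    * older (matching ELEMENT_SIZE in get_scratch_resource), 16 bytes on
+    * newer chips. */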
+   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
-                      16, &write_count, write_datas, offsets);
+                      swizzle_component_size, &write_count, write_datas, offsets);
 
    for (unsigned i = 0; i < write_count; i++) {
       aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
-      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
+      Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+      static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
    }
 }
@@ -6996,8 +6981,7 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst
             mtbuf->offset = const_offset;
             mtbuf->glc = true;
             mtbuf->slc = true;
-            mtbuf->barrier = barrier_gs_data;
-            mtbuf->can_reorder = true;
+            mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
             bld.insert(std::move(mtbuf));
          }
@@ -7149,9 +7133,7 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s
    } else if (src.regClass() == s2) {
       bld.sop1(aco_opcode::s_mov_b64, dst, src);
    } else {
-      fprintf(stderr, "Unimplemented NIR instr bit size: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unimplemented NIR instr bit size");
    }
 }
@@ -7189,10 +7171,11 @@ void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
    }
 
    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
-   Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
-   Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
-   tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
-   tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
+   aco_opcode mad = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
+   Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
+   Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
+   tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
+   tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
    Temp wqm1 = bld.tmp(v1);
    emit_wqm(ctx, tmp1, wqm1, true);
    Temp wqm2 = bld.tmp(v1);
@@ -7263,6 +7246,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
       Temp private_segment_buffer = ctx->program->private_segment_buffer;
+      //TODO: bounds checking?
       if (addr.type() == RegType::sgpr) {
          Operand offset;
          if (const_addr) {
@@ -7321,8 +7305,6 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
          load->glc = false;
          load->dlc = false;
          load->disable_wqm = false;
-         load->barrier = barrier_none;
-         load->can_reorder = true;
          ctx->block->instructions.emplace_back(std::move(load));
       }
@@ -7435,6 +7417,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
    case nir_intrinsic_shared_atomic_xor:
    case nir_intrinsic_shared_atomic_exchange:
    case nir_intrinsic_shared_atomic_comp_swap:
+   case nir_intrinsic_shared_atomic_fadd:
       visit_shared_atomic(ctx, instr);
       break;
    case nir_intrinsic_image_deref_load:
@@ -7503,27 +7486,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
    case nir_intrinsic_get_buffer_size:
       visit_get_buffer_size(ctx, instr);
      break;
-   case nir_intrinsic_control_barrier: {
-      if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
-         /* GFX6 only (thanks to a hw bug workaround):
-          * The real barrier instruction isn’t needed, because an entire patch
-          * always fits into a single wave.
-          */
-         break;
-      }
-
-      if (ctx->program->workgroup_size > ctx->program->wave_size)
-         bld.sopp(aco_opcode::s_barrier);
-
-      break;
-   }
-   case nir_intrinsic_memory_barrier_tcs_patch:
-   case nir_intrinsic_group_memory_barrier:
-   case nir_intrinsic_memory_barrier:
-   case nir_intrinsic_memory_barrier_buffer:
-   case nir_intrinsic_memory_barrier_image:
-   case nir_intrinsic_memory_barrier_shared:
-      emit_memory_barrier(ctx, instr);
+   case nir_intrinsic_scoped_barrier:
+      emit_scoped_barrier(ctx, instr);
      break;
    case nir_intrinsic_load_num_work_groups: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
@@ -7600,9 +7564,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
      } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
         bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
      } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      if (dst.size() != bld.lm.size()) {
         /* Wave32 with ballot size set to 64 */
@@ -7654,9 +7616,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
            tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
            emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
         } else {
-            fprintf(stderr, "Unimplemented NIR instr bit size: ");
-            nir_print_instr(&instr->instr, stderr);
-            fprintf(stderr, "\n");
+            isel_err(&instr->instr, "Unimplemented NIR instr bit size");
         }
      }
      break;
@@ -7694,9 +7654,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
      } else if (src.regClass() == s2) {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
      } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
@@ -7867,9 +7825,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
   }
   break;
@@ -7947,9 +7903,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      emit_split_vector(ctx, dst, 2);
   } else {
-      fprintf(stderr, "Unimplemented NIR instr bit size: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unimplemented NIR instr bit size");
   }
   break;
 }
@@ -7983,9 +7937,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
        emit_split_vector(ctx, dst, 2);
     } else {
-        fprintf(stderr, "Unimplemented NIR instr bit size: ");
-        nir_print_instr(&instr->instr, stderr);
-        fprintf(stderr, "\n");
+        isel_err(&instr->instr, "Unimplemented NIR instr bit size");
     }
     break;
   }
@@ -8007,9 +7959,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
        emit_split_vector(ctx, dst, 2);
     } else {
-        fprintf(stderr, "Unimplemented NIR instr bit size: ");
-        nir_print_instr(&instr->instr, stderr);
-        fprintf(stderr, "\n");
+        isel_err(&instr->instr, "Unimplemented NIR instr bit size");
     }
     break;
   }
@@ -8063,11 +8013,18 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
      break;
   }
   case nir_intrinsic_shader_clock: {
-      aco_opcode opcode =
-         nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
-            aco_opcode::s_memrealtime : aco_opcode::s_memtime;
-      bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
-      emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && ctx->options->chip_class >= GFX10_3) {
+         /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
+         Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand(0u));
+      } else {
+         aco_opcode opcode =
+            nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
+               aco_opcode::s_memrealtime : aco_opcode::s_memtime;
+         bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
+      }
+      emit_split_vector(ctx, dst, 2);
      break;
   }
   case nir_intrinsic_load_vertex_id_zero_base: {
@@ -8153,9 +8110,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
      break;
   }
   default:
-      fprintf(stderr, "Unimplemented intrinsic instr: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unimplemented intrinsic instr");
      abort();
 
      break;
@@ -8281,6 +8236,8 @@ void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx
 {
   Builder bld(ctx->program, ctx->block);
   Temp ma, tc, sc, id;
+   aco_opcode madak = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
+   aco_opcode madmk = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
 
   if (is_array) {
      coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
@@ -8301,11 +8258,11 @@ void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx
 
   sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
   if (!is_deriv)
-      sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
+      sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
 
   tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
   if (!is_deriv)
-      tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
+      tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
 
   id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
@@ -8336,7 +8293,7 @@ void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx
   }
 
   if (is_array)
-      id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
+      id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
   coords.resize(3);
   coords[0] = sc;
   coords[1] = tc;
@@ -8647,7 +8604,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
      tex->da = da;
      tex->definitions[0] = Definition(tmp_dst);
      tex->dim = dim;
-      tex->can_reorder = true;
      ctx->block->instructions.emplace_back(std::move(tex));
 
      if (div_by_6) {
@@ -8680,7 +8636,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
      tex->da = da;
      Temp size = bld.tmp(v2);
      tex->definitions[0] = Definition(size);
-      tex->can_reorder = true;
      ctx->block->instructions.emplace_back(std::move(tex));
      emit_split_vector(ctx, size, size.size());
@@ -8782,7 +8737,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
      mubuf->operands[2] = Operand((uint32_t) 0);
      mubuf->definitions[0] = Definition(tmp_dst);
      mubuf->idxen = true;
-      mubuf->can_reorder = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
 
      expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
@@ -8831,7 +8785,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
      tex->unrm = true;
      tex->da = da;
      tex->definitions[0] = Definition(tmp_dst);
-      tex->can_reorder = true;
      ctx->block->instructions.emplace_back(std::move(tex));
 
      if (instr->op == nir_texop_samples_identical) {
@@ -8975,7 +8928,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
   tex->dmask = dmask;
   tex->da = da;
   tex->definitions[0] = Definition(tmp_dst);
-   tex->can_reorder = true;
   ctx->block->instructions.emplace_back(std::move(tex));
 
   if (tg4_integer_cube_workaround) {
@@ -9225,9 +9177,7 @@ void visit_jump(isel_context *ctx, nir_jump_instr *instr)
      }
      break;
   default:
-      fprintf(stderr, "Unknown NIR jump instr: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unknown NIR jump instr");
      abort();
   }
 
@@ -9285,9 +9235,7 @@ void visit_block(isel_context *ctx, nir_block *block)
         visit_jump(ctx, nir_instr_as_jump(instr));
         break;
      default:
-         fprintf(stderr, "Unknown NIR instr type: ");
-         nir_print_instr(instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(instr, "Unknown NIR instr type");
         //abort();
      }
   }
@@ -9843,10 +9791,10 @@ static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex
      else
        exp->operands[i] = Operand(v1);
   }
-   /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+   /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
-   exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
+   exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
   exp->done = false;
   exp->compressed = false;
   if (is_pos)
@@ -9888,7 +9836,7 @@ static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
         exp->enabled_mask |= 0x4;
      }
   }
-   exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
+   exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
   exp->done = false;
   exp->compressed = false;
   exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
@@ -10258,6 +10206,13 @@ static void create_fs_exports(isel_context *ctx)
      create_null_export(ctx);
 }
 
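+/* Workgroup-wide control barrier that also acquires/releases shared
+ * memory (LDS). */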
+static void create_workgroup_barrier(Builder& bld)
+{
+   bld.barrier(aco_opcode::p_barrier,
+               memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
+               scope_workgroup);
+}
+
 static void write_tcs_tess_factors(isel_context *ctx)
 {
   unsigned outer_comps;
@@ -10282,9 +10237,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
 
   Builder bld(ctx->program, ctx->block);
 
-   bld.barrier(aco_opcode::p_memory_barrier_shared);
-   if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
-      bld.sopp(aco_opcode::s_barrier);
+   create_workgroup_barrier(bld);
 
   Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
   Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
@@ -10334,8 +10287,8 @@ static void write_tcs_tess_factors(isel_context *ctx)
      Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
      bld.mubuf(aco_opcode::buffer_store_dword,
                /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
-                /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
-                /* disable_wqm */ false, /* glc */ true);
+                /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false,
+                /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
      tf_const_offset += 4;
 
      begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
@@ -10345,7 +10298,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
 
   assert(stride == 2 || stride == 4 || stride == 6);
   Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
-   store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
+   store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, memory_sync_info());
 
   /* Store to offchip for TES to read - only if TES reads them */
   if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
@@ -10353,11 +10306,11 @@ static void write_tcs_tess_factors(isel_context *ctx)
      Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
 
      std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
-      store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
+      store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, memory_sync_info(storage_vmem_output));
 
      if (likely(inner_comps)) {
        std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
-         store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
+         store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, memory_sync_info(storage_vmem_output));
      }
   }
@@ -10443,7 +10396,6 @@ static void emit_stream_output(isel_context *ctx,
      store->glc = true;
      store->dlc = false;
      store->slc = true;
-      store->can_reorder = true;
      ctx->block->instructions.emplace_back(std::move(store));
   }
 }
@@ -10863,8 +10815,7 @@ void ngg_emit_nogs_output(isel_context *ctx)
 
   if (ctx->stage == ngg_vertex_gs) {
      /* Wait for GS threads to store primitive ID in LDS. */
-      bld.barrier(aco_opcode::p_memory_barrier_shared);
-      bld.sopp(aco_opcode::s_barrier);
+      create_workgroup_barrier(bld);
 
      /* Calculate LDS address where the GS threads stored the primitive ID. */
      Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
@@ -10948,8 +10899,7 @@ void select_program(Program *program,
      if (i) {
        Builder bld(ctx.program, ctx.block);
 
-         bld.barrier(aco_opcode::p_memory_barrier_shared);
-         bld.sopp(aco_opcode::s_barrier);
+         create_workgroup_barrier(bld);
 
        if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
          ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
@@ -10972,7 +10922,8 @@ void select_program(Program *program,
        ngg_emit_nogs_output(&ctx);
      } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
        Builder bld(ctx.program, ctx.block);
-         bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+         bld.barrier(aco_opcode::p_barrier,
+                     memory_sync_info(storage_vmem_output, semantic_release, scope_device));
        bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
      } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
        write_tcs_tess_factors(&ctx);
@@ -11004,7 +10955,7 @@ void select_program(Program *program,
   ctx.block->kind |= block_kind_uniform;
   Builder bld(ctx.program, ctx.block);
   if (ctx.program->wb_smem_l1_on_end)
-      bld.smem(aco_opcode::s_dcache_wb, false);
+      bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
   bld.sopp(aco_opcode::s_endpgm);
 
   cleanup_cfg(program);
@@ -11016,16 +10967,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
 {
   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
 
-   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
-   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
-   program->next_fp_mode.must_flush_denorms32 = false;
-   program->next_fp_mode.must_flush_denorms16_64 = false;
-   program->next_fp_mode.care_about_round32 = false;
-   program->next_fp_mode.care_about_round16_64 = false;
-   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
-   program->next_fp_mode.denorm32 = 0;
-   program->next_fp_mode.round32 = fp_round_ne;
-   program->next_fp_mode.round16_64 = fp_round_ne;
   ctx.block->fp_mode = program->next_fp_mode;
 
   add_startpgm(&ctx);
@@ -11099,8 +11040,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
            mubuf->glc = true;
            mubuf->slc = true;
            mubuf->dlc = args->options->chip_class >= GFX10;
-            mubuf->barrier = barrier_none;
-            mubuf->can_reorder = true;
 
            ctx.outputs.mask[i] |= 1 << j;
           ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();