From 2694a34aa2c32aeb32d7d70af91db56c6eaaa23b Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 15 Oct 2019 17:25:57 +0100 Subject: [PATCH] aco: add NUW flag MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This (combined with a pass to actually set the corresponding NIR flags) should help fix a lot of the regressions from the SMEM addition combining change. fossil-db (Navi): Totals from 12 (0.01% of 135946) affected shaders: CodeSize: 12376 -> 12304 (-0.58%) Instrs: 2436 -> 2422 (-0.57%) VMEM: 1105 -> 1096 (-0.81%) SClause: 133 -> 130 (-2.26%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_builder_h.py | 8 ++++++++ src/amd/compiler/aco_instruction_selection.cpp | 11 +++++++---- src/amd/compiler/aco_ir.h | 15 ++++++++++++++- src/amd/compiler/aco_opt_value_numbering.cpp | 6 ++++++ src/amd/compiler/aco_optimizer.cpp | 5 ++--- src/amd/compiler/aco_print_ir.cpp | 2 ++ 6 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index e10358a1cc3..b3adb14dc8e 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -176,6 +176,7 @@ public: std::vector> *instructions; std::vector>::iterator it; bool is_precise = false; + bool is_nuw = false; Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm->lane_mask), instructions(NULL) {} Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? 
pgm->lane_mask : s2), instructions(&block->instructions) {} @@ -187,6 +188,12 @@ public: return res; }; + Builder nuw() const { + Builder res = *this; + res.is_nuw = true; + return res; + } + void moveEnd(Block *block) { instructions = &block->instructions; } @@ -572,6 +579,7 @@ formats = [(f if len(f) == 5 else f + ('',)) for f in formats] % for i in range(num_definitions): instr->definitions[${i}] = def${i}; instr->definitions[${i}].setPrecise(is_precise); + instr->definitions[${i}].setNUW(is_nuw); % endfor % for i in range(num_operands): instr->operands[${i}] = op${i}.op; diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 0af1f1f5c15..34887f2f5be 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -614,6 +614,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); sop2->definitions[0] = Definition(dst); + if (instr->no_unsigned_wrap) + sop2->definitions[0].setNUW(true); if (writes_scc) sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); ctx->block->instructions.emplace_back(std::move(sop2)); @@ -5287,7 +5289,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); if (offset != 0) // TODO check if index != 0 as well - index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); Temp vec = dst; bool trim = false; @@ -5373,7 +5375,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp offset = get_ssa_temp(ctx, 
instr->src[0].ssa); if (base && offset.type() == RegType::sgpr) - offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); + offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); else if (base && offset.type() == RegType::vgpr) offset = bld.vadd32(bld.def(v1), Operand(base), offset); @@ -6255,8 +6257,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) aco_ptr store{create_instruction(op, Format::SMEM, 3, 0)}; store->operands[0] = Operand(rsrc); if (offsets[i]) { - Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - offset, Operand(offsets[i])); + Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + offset, Operand(offsets[i])); store->operands[1] = Operand(off); } else { store->operands[1] = Operand(offset); @@ -7273,6 +7275,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); Temp private_segment_buffer = ctx->program->private_segment_buffer; + //TODO: bounds checking? 
if (addr.type() == RegType::sgpr) { Operand offset; if (const_addr) { diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index a23ff7ce017..661b6982df9 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -700,7 +700,8 @@ private: class Definition final { public: - constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0) {} + constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), + isKill_(0), isPrecise_(0), isNUW_(0) {} Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {} explicit Definition(Temp tmp) noexcept @@ -797,6 +798,17 @@ public: return isPrecise_; } + /* No Unsigned Wrap */ + constexpr void setNUW(bool nuw) noexcept + { + isNUW_ = nuw; + } + + constexpr bool isNUW() const noexcept + { + return isNUW_; + } + private: Temp temp = Temp(0, s1); PhysReg reg_; @@ -806,6 +818,7 @@ private: uint8_t hasHint_:1; uint8_t isKill_:1; uint8_t isPrecise_:1; + uint8_t isNUW_:1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint8_t control_ = 0; diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 9a1972ff34e..2fdbfaabd4a 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -406,6 +406,12 @@ void process_block(vn_ctx& ctx, Block& block) ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp(); if (instr->definitions[i].isPrecise()) orig_instr->definitions[i].setPrecise(true); + /* SPIR-V spec says that an instruction marked with NUW wrapping + * around is undefined behaviour, so we can break additions in + * other contexts. 
+ */ + if (instr->definitions[i].isNUW()) + orig_instr->definitions[i].setNUW(true); } } else { ctx.expr_values.erase(res.first); diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index a254728baa5..e00c8c1fcaf 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -717,9 +717,6 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow) { - if (prevent_overflow) - return false; //TODO - Operand op = instr->operands[op_index]; if (!op.isTemp()) @@ -740,6 +737,8 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp default: return false; } + if (prevent_overflow && !add_instr->definitions[0].isNUW()) + return false; if (add_instr->usesModifiers()) return false; diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 3daa60b71c1..9172fc1ac60 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -181,6 +181,8 @@ static void print_definition(const Definition *definition, FILE *output) print_reg_class(definition->regClass(), output); if (definition->isPrecise()) fprintf(output, "(precise)"); + if (definition->isNUW()) + fprintf(output, "(nuw)"); fprintf(output, "%%%d", definition->tempId()); if (definition->isFixed()) -- 2.30.2