From 2694a34aa2c32aeb32d7d70af91db56c6eaaa23b Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Tue, 15 Oct 2019 17:25:57 +0100
Subject: [PATCH] aco: add NUW flag
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This (combined with a pass to actually set the corresponding NIR flags)
should help fix a lot of the regressions from the SMEM addition combining
change.

fossil-db (Navi):
Totals from 12 (0.01% of 135946) affected shaders:
CodeSize: 12376 -> 12304 (-0.58%)
Instrs: 2436 -> 2422 (-0.57%)
VMEM: 1105 -> 1096 (-0.81%)
SClause: 133 -> 130 (-2.26%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel SchÃ¼rmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2720>
---
 src/amd/compiler/aco_builder_h.py              |  8 ++++++++
 src/amd/compiler/aco_instruction_selection.cpp | 11 +++++++----
 src/amd/compiler/aco_ir.h                      | 15 ++++++++++++++-
 src/amd/compiler/aco_opt_value_numbering.cpp   |  6 ++++++
 src/amd/compiler/aco_optimizer.cpp             |  5 ++---
 src/amd/compiler/aco_print_ir.cpp              |  2 ++
 6 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index e10358a1cc3..b3adb14dc8e 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -176,6 +176,7 @@ public:
    std::vector<aco_ptr<Instruction>> *instructions;
    std::vector<aco_ptr<Instruction>>::iterator it;
    bool is_precise = false;
+   bool is_nuw = false;
 
    Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm->lane_mask), instructions(NULL) {}
    Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(&block->instructions) {}
@@ -187,6 +188,12 @@ public:
       return res;
    };
 
+   Builder nuw() const {
+      Builder res = *this;
+      res.is_nuw = true;
+      return res;
+   }
+
    void moveEnd(Block *block) {
       instructions = &block->instructions;
    }
@@ -572,6 +579,7 @@ formats = [(f if len(f) == 5 else f + ('',)) for f in formats]
         % for i in range(num_definitions):
             instr->definitions[${i}] = def${i};
             instr->definitions[${i}].setPrecise(is_precise);
+            instr->definitions[${i}].setNUW(is_nuw);
         % endfor
         % for i in range(num_operands):
             instr->operands[${i}] = op${i}.op;
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 0af1f1f5c15..34887f2f5be 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -614,6 +614,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
    sop2->definitions[0] = Definition(dst);
+   if (instr->no_unsigned_wrap)
+      sop2->definitions[0].setNUW(true);
    if (writes_scc)
       sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
    ctx->block->instructions.emplace_back(std::move(sop2));
@@ -5287,7 +5289,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
 
    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
    if (offset != 0) // TODO check if index != 0 as well
-      index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+      index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
    Temp vec = dst;
    bool trim = false;
@@ -5373,7 +5375,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
 
    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
    if (base && offset.type() == RegType::sgpr)
-      offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+      offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
    else if (base && offset.type() == RegType::vgpr)
       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
 
@@ -6255,8 +6257,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
          aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
          store->operands[0] = Operand(rsrc);
          if (offsets[i]) {
-            Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                offset, Operand(offsets[i]));
+            Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+                                      offset, Operand(offsets[i]));
             store->operands[1] = Operand(off);
          } else {
             store->operands[1] = Operand(offset);
@@ -7273,6 +7275,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
       Temp private_segment_buffer = ctx->program->private_segment_buffer;
+      //TODO: bounds checking?
       if (addr.type() == RegType::sgpr) {
          Operand offset;
          if (const_addr) {
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index a23ff7ce017..661b6982df9 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -700,7 +700,8 @@ private:
 class Definition final
 {
 public:
-   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), isKill_(0), isPrecise_(0) {}
+   constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0),
+                            isKill_(0), isPrecise_(0), isNUW_(0) {}
    Definition(uint32_t index, RegClass type) noexcept
       : temp(index, type) {}
    explicit Definition(Temp tmp) noexcept
@@ -797,6 +798,17 @@ public:
       return isPrecise_;
    }
 
+   /* No Unsigned Wrap */
+   constexpr void setNUW(bool nuw) noexcept
+   {
+      isNUW_ = nuw;
+   }
+
+   constexpr bool isNUW() const noexcept
+   {
+      return isNUW_;
+   }
+
 private:
    Temp temp = Temp(0, s1);
    PhysReg reg_;
@@ -806,6 +818,7 @@ private:
          uint8_t hasHint_:1;
          uint8_t isKill_:1;
          uint8_t isPrecise_:1;
+         uint8_t isNUW_:1;
       };
       /* can't initialize bit-fields in c++11, so work around using a union */
       uint8_t control_ = 0;
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index 9a1972ff34e..2fdbfaabd4a 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -406,6 +406,12 @@ void process_block(vn_ctx& ctx, Block& block)
                ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp();
                if (instr->definitions[i].isPrecise())
                   orig_instr->definitions[i].setPrecise(true);
+               /* SPIR_V spec says that an instruction marked with NUW wrapping
+                * around is undefined behaviour, so we can break additions in
+                * other contexts.
+                */
+               if (instr->definitions[i].isNUW())
+                  orig_instr->definitions[i].setNUW(true);
             }
          } else {
             ctx.expr_values.erase(res.first);
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index a254728baa5..e00c8c1fcaf 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -717,9 +717,6 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands)
 
 bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow)
 {
-   if (prevent_overflow)
-      return false; //TODO
-
    Operand op = instr->operands[op_index];
 
    if (!op.isTemp())
@@ -740,6 +737,8 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp
    default:
       return false;
    }
+   if (prevent_overflow && !add_instr->definitions[0].isNUW())
+      return false;
 
    if (add_instr->usesModifiers())
       return false;
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index 3daa60b71c1..9172fc1ac60 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -181,6 +181,8 @@ static void print_definition(const Definition *definition, FILE *output)
    print_reg_class(definition->regClass(), output);
    if (definition->isPrecise())
       fprintf(output, "(precise)");
+   if (definition->isNUW())
+      fprintf(output, "(nuw)");
    fprintf(output, "%%%d", definition->tempId());
 
    if (definition->isFixed())
-- 
2.30.2