aco: be more careful combining additions that could wrap into loads/stores

author Rhys Perry <pendingchaos02@gmail.com>

Tue, 15 Oct 2019 16:00:55 +0000 (17:00 +0100)

committer Marge Bot <eric+marge@anholt.net>

Tue, 21 Jul 2020 18:25:35 +0000 (18:25 +0000)
author Rhys Perry <pendingchaos02@gmail.com>
Tue, 15 Oct 2019 16:00:55 +0000 (17:00 +0100)
committer Marge Bot <eric+marge@anholt.net>
Tue, 21 Jul 2020 18:25:35 +0000 (18:25 +0000)
diff --git a/.gitlab-ci/deqp-radv-navi10-aco-fails.txt b/.gitlab-ci/deqp-radv-navi10-aco-fails.txt

index 1f4a94c4d9c853aa9cd1b115676870c41240f7fa..3c810f18c7440b5cb47da4e0a55d04d03866a128 100644 (file)
--- a/.gitlab-ci/deqp-radv-navi10-aco-fails.txt
+++ b/.gitlab-ci/deqp-radv-navi10-aco-fails.txt
@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
  dEQP-VK.rasterization.flatshading.line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_lines_wide
diff --git a/.gitlab-ci/deqp-radv-navi14-aco-fails.txt b/.gitlab-ci/deqp-radv-navi14-aco-fails.txt

index 1f4a94c4d9c853aa9cd1b115676870c41240f7fa..3c810f18c7440b5cb47da4e0a55d04d03866a128 100644 (file)
--- a/.gitlab-ci/deqp-radv-navi14-aco-fails.txt
+++ b/.gitlab-ci/deqp-radv-navi14-aco-fails.txt
@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
  dEQP-VK.rasterization.flatshading.line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_lines_wide
diff --git a/.gitlab-ci/deqp-radv-raven-aco-fails.txt b/.gitlab-ci/deqp-radv-raven-aco-fails.txt

index 1f4a94c4d9c853aa9cd1b115676870c41240f7fa..3c810f18c7440b5cb47da4e0a55d04d03866a128 100644 (file)
--- a/.gitlab-ci/deqp-radv-raven-aco-fails.txt
+++ b/.gitlab-ci/deqp-radv-raven-aco-fails.txt
@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
  dEQP-VK.rasterization.flatshading.line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_lines_wide
diff --git a/.gitlab-ci/deqp-radv-vega10-aco-fails.txt b/.gitlab-ci/deqp-radv-vega10-aco-fails.txt

index 1f4a94c4d9c853aa9cd1b115676870c41240f7fa..3c810f18c7440b5cb47da4e0a55d04d03866a128 100644 (file)
--- a/.gitlab-ci/deqp-radv-vega10-aco-fails.txt
+++ b/.gitlab-ci/deqp-radv-vega10-aco-fails.txt
@@ -1,7 +1,3 @@
-# ACO specific issues.
-dEQP-VK.transform_feedback.simple.multistreams_1
-dEQP-VK.transform_feedback.simple.multistreams_3
-
  dEQP-VK.rasterization.flatshading.line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide
  dEQP-VK.rasterization.flatshading.non_strict_lines_wide
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py

index 7fdff9baf831324dbc68ffe76aceb986c63ef7b1..e10358a1cc30cbaac635687959e89503e085149f 100644 (file)
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -552,8 +552,9 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
             ("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])),
             ("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]),
             ("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])]
+formats = [(f if len(f) == 5 else f + ('',)) for f in formats]
  %>\\
-% for name, formats, struct, shapes in formats:
+% for name, formats, struct, shapes, extra_field_setup in formats:
      % for num_definitions, num_operands in shapes:
          <%
          args = ['aco_opcode opcode']
@@ -581,6 +582,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
              % endfor
              ${f.get_builder_initialization(num_operands)}
          % endfor
+       ${extra_field_setup}
        return insert(instr);
     }
  
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index e8fd15c293aca59bee43d048eb98a64c23faeb65..4d161cf0dbdfa6296b69a923a5df1d9a608f9f74 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3539,6 +3539,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
     mubuf->barrier = info->barrier;
     mubuf->can_reorder = info->can_reorder;
     mubuf->offset = const_offset;
+   mubuf->swizzled = info->swizzle_component_size != 0;
     RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
     Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
     mubuf->definitions[0] = Definition(val);
@@ -3990,7 +3991,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un
  }
  
  void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
-                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false,
+                             bool swizzled = false)
  {
     assert(vdata.id());
     assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
@@ -4003,8 +4005,9 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T
     Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
     Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
     Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
-                                 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
-                                 /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+                                 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
+                                 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
+                                 /* dlc*/ false, /* slc */ slc);
  
     static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
  }
@@ -4026,7 +4029,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset
  
     for (unsigned i = 0; i < write_count; i++) {
        unsigned const_offset = offsets[i] + base_const_offset;
-      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining);
     }
  }
  
@@ -4838,7 +4841,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
           if (use_mubuf) {
              Instruction *mubuf = bld.mubuf(opcode,
                                             Definition(fetch_dst), list, fetch_index, soffset,
-                                           fetch_offset, false, true).instr;
+                                           fetch_offset, false, false, true).instr;
              static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
           } else {
              Instruction *mtbuf = bld.mtbuf(opcode,
@@ -6889,7 +6892,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
  
     for (unsigned i = 0; i < write_count; i++) {
        aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
-      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
+      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
     }
  }
  
@@ -10341,8 +10344,8 @@ static void write_tcs_tess_factors(isel_context *ctx)
        Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
        bld.mubuf(aco_opcode::buffer_store_dword,
                  /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
-                /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
-                /* disable_wqm */ false, /* glc */ true);
+                /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false,
+                /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
        tf_const_offset += 4;
  
        begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h

index 8adbd567c01c4c65438d06f6a6c4430d875b213e..c1735a0d31a276f55b87995e22087c8e0f31f69b 100644 (file)
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1078,7 +1078,8 @@ struct MUBUF_instruction : public Instruction {
     bool lds : 1; /* Return read-data to LDS instead of VGPRs */
     bool disable_wqm : 1; /* Require an exec mask without helper invocations */
     bool can_reorder : 1;
-   uint8_t padding : 2;
+   bool swizzled:1;
+   uint8_t padding : 1;
     barrier_interaction barrier;
  };
  static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py

index 89e30d734f63870ff5d9848b778000d0e950133a..46246cbea01254e852340387b33462bde23c212a 100644 (file)
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -88,6 +88,7 @@ class Format(Enum):
        elif self == Format.MUBUF:
           return [('unsigned', 'offset', None),
                   ('bool', 'offen', None),
+                 ('bool', 'swizzled', 'false'),
                   ('bool', 'idxen', 'false'),
                   ('bool', 'addr64', 'false'),
                   ('bool', 'disable_wqm', 'false'),
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp

index ab38821479e5419758684dbbee806a8b1fc010a3..f66521399ca6523220ac91d8abca52201575cc34 100644 (file)
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -715,8 +715,11 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands)
     return true;
  }
  
-bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset)
+bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow)
  {
+   if (prevent_overflow)
+      return false; //TODO
+
     Operand op = instr->operands[op_index];
  
     if (!op.isTemp())
@@ -754,7 +757,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp
           continue;
  
        uint32_t offset2 = 0;
-      if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) {
+      if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
           *offset += offset2;
        } else {
           *base = add_instr->operands[!i].getTemp();
@@ -927,6 +930,15 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
           while (info.is_temp())
              info = ctx.info[info.temp.id()];
  
+         /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
+          * overflow for scratch accesses works only on GFX9+ and saddr overflow
+          * never works. Since swizzling is the only thing that separates
+          * scratch accesses and other accesses and swizzling changing how
+          * addressing works significantly, this probably applies to swizzled
+          * MUBUF accesses. */
+         bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9;
+         bool saddr_prevent_overflow = mubuf->swizzled;
+
           if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) {
              assert(!mubuf->idxen);
              instr->operands[1] = Operand(v1);
@@ -937,12 +949,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
              instr->operands[2] = Operand((uint32_t) 0);
              mubuf->offset += info.val;
              continue;
-         } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) {
+         } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) &&
+                    base.regClass() == v1 && mubuf->offset + offset < 4096) {
              assert(!mubuf->idxen);
              instr->operands[1].setTemp(base);
              mubuf->offset += offset;
              continue;
-         } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) {
+         } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) &&
+                    base.regClass() == s1 && mubuf->offset + offset < 4096) {
              instr->operands[i].setTemp(base);
              mubuf->offset += offset;
              continue;
@@ -957,7 +971,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
           uint32_t offset;
           bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
           if (has_usable_ds_offset &&
-             i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) &&
+             i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
               base.regClass() == instr->operands[i].regClass() &&
               instr->opcode != aco_opcode::ds_swizzle_b32) {
              if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 ||
@@ -993,7 +1007,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
                (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
              instr->operands[i] = Operand(info.val);
              continue;
-         } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
+         } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
              bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
              if (soe &&
                  (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) ||
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp

index c00f6466a7dedd39a1edf279828cd2c9412aa79e..d472f63ec57b543a05419b2bbedc048ca8c7e30b 100644 (file)
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1566,9 +1566,9 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
                       split->definitions[i] = bld.def(v1);
                    bld.insert(split);
                    for (unsigned i = 0; i < temp.size(); i++)
-                     bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false);
+                     bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
                 } else {
-                  bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false);
+                  bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
                 }
              } else {
                 ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
@@ -1632,11 +1632,11 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) {
                    for (unsigned i = 0; i < def.size(); i++) {
                       Temp tmp = bld.tmp(v1);
                       vec->operands[i] = Operand(tmp);
-                     bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false);
+                     bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
                    }
                    bld.insert(vec);
                 } else {
-                  bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false);
+                  bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
                 }
              } else {
                 uint32_t spill_slot = slots[spill_id];
author	Rhys Perry <pendingchaos02@gmail.com>
	Tue, 15 Oct 2019 16:00:55 +0000 (17:00 +0100)
committer	Marge Bot <eric+marge@anholt.net>
	Tue, 21 Jul 2020 18:25:35 +0000 (18:25 +0000)
.gitlab-ci/deqp-radv-navi10-aco-fails.txt		patch \| blob \| history
.gitlab-ci/deqp-radv-navi14-aco-fails.txt		patch \| blob \| history
.gitlab-ci/deqp-radv-raven-aco-fails.txt		patch \| blob \| history
.gitlab-ci/deqp-radv-vega10-aco-fails.txt		patch \| blob \| history
src/amd/compiler/aco_builder_h.py		patch \| blob \| history
src/amd/compiler/aco_instruction_selection.cpp		patch \| blob \| history
src/amd/compiler/aco_ir.h		patch \| blob \| history
src/amd/compiler/aco_opcodes.py		patch \| blob \| history
src/amd/compiler/aco_optimizer.cpp		patch \| blob \| history
src/amd/compiler/aco_spill.cpp		patch \| blob \| history