intel/fs/gen12: Workaround unwanted SEND execution due to broken NoMask control flow.
authorFrancisco Jerez <currojerez@riseup.net>
Fri, 24 Jan 2020 06:55:33 +0000 (22:55 -0800)
committerFrancisco Jerez <currojerez@riseup.net>
Fri, 14 Feb 2020 22:31:48 +0000 (14:31 -0800)
This is a less invasive alternative to the workaround documented in
the hardware spec for GEN:BUG:1407528679, which doesn't involve
disabling structured control flow (it's unlikely that switching to
GOTO/JOIN would have actually fixed the problem anyway).

Under some conditions Gen12 hardware can end up executing a BB with
all channels disabled, which will lead to the execution of any NoMask
instructions in it, even though any execution-masked instructions will
be correctly shot down.  This may break assumptions of some NoMask
SEND messages whose descriptor depends on data generated by live
invocations of the shader.

This avoids the problem by predicating certain instructions on an ANY
horizontal predicate that makes sure that their execution is omitted
when all channels of the program are disabled.  The shader-db impact
of this patch seems to be minimal:

total instructions in shared programs: 17169833 -> 17169913 (0.00%)
instructions in affected programs: 30663 -> 30743 (0.26%)
helped: 0
HURT: 42

total cycles in shared programs: 336966176 -> 336968568 (0.00%)
cycles in affected programs: 2367290 -> 2369682 (0.10%)
helped: 0
HURT: 13

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Cc: 20.0 <mesa-stable@lists.freedesktop.org>
src/intel/compiler/brw_fs.cpp
src/intel/compiler/brw_fs.h

index 1b582cda0cd6f7e9342b924540ae556acf6d05f3..7ebbe47b887136bcb57b6750b679603845dcbb5d 100644 (file)
@@ -31,6 +31,7 @@
 #include "main/macros.h"
 #include "brw_eu.h"
 #include "brw_fs.h"
+#include "brw_fs_live_variables.h"
 #include "brw_nir.h"
 #include "brw_vec4_gs_visitor.h"
 #include "brw_cfg.h"
@@ -7448,6 +7449,9 @@ fs_visitor::optimize()
 
    OPT(lower_logical_sends);
 
+   /* After logical SEND lowering. */
+   OPT(fixup_nomask_control_flow);
+
    if (progress) {
       OPT(opt_copy_propagation);
       /* Only run after logical send lowering because it's easier to implement
@@ -7576,6 +7580,151 @@ fs_visitor::fixup_3src_null_dest()
       invalidate_live_intervals();
 }
 
+/**
+ * Find the first instruction in the program that might start a region of
+ * divergent control flow due to a HALT jump.  There is no
+ * find_halt_control_flow_region_end(), the region of divergence extends until
+ * the only FS_OPCODE_PLACEHOLDER_HALT in the program.
+ */
+static const fs_inst *
+find_halt_control_flow_region_start(const fs_visitor *v)
+{
+   if (brw_wm_prog_data(v->prog_data)->uses_kill) {
+      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+         if (inst->opcode == FS_OPCODE_DISCARD_JUMP ||
+             inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
+            return inst;
+      }
+   }
+
+   return NULL;
+}
+
+/**
+ * Work around the Gen12 hardware bug filed as GEN:BUG:1407528679.  EU fusion
+ * can cause a BB to be executed with all channels disabled, which will lead
+ * to the execution of any NoMask instructions in it, even though any
+ * execution-masked instructions will be correctly shot down.  This may break
+ * assumptions of some NoMask SEND messages whose descriptor depends on data
+ * generated by live invocations of the shader.
+ *
+ * This avoids the problem by predicating certain instructions on an ANY
+ * horizontal predicate that makes sure that their execution is omitted when
+ * all channels of the program are disabled.
+ */
+bool
+fs_visitor::fixup_nomask_control_flow()
+{
+   if (devinfo->gen != 12)
+      return false;
+
+   const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
+                              dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
+                              BRW_PREDICATE_ALIGN1_ANY8H;
+   const fs_inst *halt_start = find_halt_control_flow_region_start(this);
+   unsigned depth = 0;
+   bool progress = false;
+
+   calculate_live_intervals();
+
+   /* Scan the program backwards in order to be able to easily determine
+    * whether the flag register is live at any point.
+    */
+   foreach_block_reverse_safe(block, cfg) {
+      BITSET_WORD flag_liveout = live_intervals->block_data[block->num]
+                                               .flag_liveout[0];
+      STATIC_ASSERT(ARRAY_SIZE(live_intervals->block_data[0].flag_liveout) == 1);
+
+      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+         if (!inst->predicate && inst->exec_size >= 8)
+            flag_liveout &= ~inst->flags_written();
+
+         switch (inst->opcode) {
+         case BRW_OPCODE_DO:
+         case BRW_OPCODE_IF:
+            /* Note that this doesn't handle FS_OPCODE_DISCARD_JUMP since only
+             * the first one in the program closes the region of divergent
+             * control flow due to any HALT instructions -- Instead this is
+             * handled with the halt_start check below.
+             */
+            depth--;
+            break;
+
+         case BRW_OPCODE_WHILE:
+         case BRW_OPCODE_ENDIF:
+         case FS_OPCODE_PLACEHOLDER_HALT:
+            depth++;
+            break;
+
+         default:
+            /* Note that the vast majority of NoMask SEND instructions in the
+             * program are harmless while executed in a block with all
+             * channels disabled, since any instructions with side effects we
+             * could hit here should be execution-masked.
+             *
+             * The main concern is NoMask SEND instructions where the message
+             * descriptor or header depends on data generated by live
+             * invocations of the shader (RESINFO and
+             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
+             * computed surface index seem to be the only examples right now
+             * where this could easily lead to GPU hangs).  Unfortunately we
+             * have no straightforward way to detect that currently, so just
+             * predicate any NoMask SEND instructions we find under control
+             * flow.
+             *
+             * If this proves to have a measurable performance impact it can
+             * be easily extended with a whitelist of messages we know we can
+             * safely omit the predication for.
+             */
+            if (depth && inst->force_writemask_all &&
+                is_send(inst) && !inst->predicate) {
+               /* We need to load the execution mask into the flag register by
+                * using a builder with channel group matching the whole shader
+                * (rather than the default which is derived from the original
+                * instruction), in order to avoid getting a right-shifted
+                * value.
+                */
+               const fs_builder ubld = fs_builder(this, block, inst)
+                                       .exec_all().group(dispatch_width, 0);
+               const fs_reg flag = retype(brw_flag_reg(0, 0),
+                                          BRW_REGISTER_TYPE_UD);
+
+               /* Due to the lack of flag register allocation we need to save
+                * and restore the flag register if it's live.
+                */
+               const bool save_flag = flag_liveout &
+                                      flag_mask(flag, dispatch_width / 8);
+               const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);
+
+               if (save_flag)
+                  ubld.group(1, 0).MOV(tmp, flag);
+
+               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
+
+               set_predicate(pred, inst);
+               inst->flag_subreg = 0;
+
+               if (save_flag)
+                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
+
+               progress = true;
+            }
+            break;
+         }
+
+         if (inst == halt_start)
+            depth--;
+
+         flag_liveout |= inst->flags_read(devinfo);
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
 {
index 5784da7535cf95bc621dad9c815f32ada3589235..dfa1ff4d0e1fc60a92abf5e92048ac151abd211c 100644 (file)
@@ -107,6 +107,7 @@ public:
    void setup_cs_payload();
    bool fixup_sends_duplicate_payload();
    void fixup_3src_null_dest();
+   bool fixup_nomask_control_flow();
    void assign_curb_setup();
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);