intel/fs: Only propagate saturation if exec_size is the same.

[mesa.git] / src / intel / compiler / brw_fs_reg_allocate.cpp
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp

index ec8e116cb384d6c4756b2cb4a9dabc1a27871c2e..d5c4f032182d4101cf6008ffc34381ff8099d59c 100644 (file)
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -548,6 +548,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
     int first_mrf_hack_node = node_count;
     if (devinfo->gen >= 7)
        node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
+   int grf127_send_hack_node = node_count;
+   if (devinfo->gen >= 8)
+      node_count ++;
     struct ra_graph *g =
        ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
  
@@ -614,7 +617,9 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
            * highest register that works.
            */
           if (inst->eot) {
-            int size = alloc.sizes[inst->src[0].nr];
+            const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
+                             inst->src[2].nr : inst->src[0].nr;
+            int size = alloc.sizes[vgrf];
              int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
  
              /* If something happened to spill, we want to push the EOT send
@@ -623,31 +628,94 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
               */
              reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf;
  
-            ra_set_node_reg(g, inst->src[0].nr, reg);
+            ra_set_node_reg(g, vgrf, reg);
              break;
           }
        }
     }
  
-   if (dispatch_width > 8) {
-      /* In 16-wide dispatch we have an issue where a compressed
-       * instruction is actually two instructions executed simultaneiously.
-       * It's actually ok to have the source and destination registers be
-       * the same.  In this case, each instruction over-writes its own
-       * source and there's no problem.  The real problem here is if the
-       * source and destination registers are off by one.  Then you can end
-       * up in a scenario where the first instruction over-writes the
-       * source of the second instruction.  Since the compiler doesn't know
-       * about this level of granularity, we simply make the source and
-       * destination interfere.
+   /* In 16-wide instructions we have an issue where a compressed
+    * instruction is actually two instructions executed simultaneously.
+    * It's actually ok to have the source and destination registers be
+    * the same.  In this case, each instruction over-writes its own
+    * source and there's no problem.  The real problem here is if the
+    * source and destination registers are off by one.  Then you can end
+    * up in a scenario where the first instruction over-writes the
+    * source of the second instruction.  Since the compiler doesn't know
+    * about this level of granularity, we simply make the source and
+    * destination interfere.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->exec_size < 16 || inst->dst.file != VGRF)
+         continue;
+
+      for (int i = 0; i < inst->sources; ++i) {
+         if (inst->src[i].file == VGRF) {
+            ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+         }
+      }
+   }
+
+   if (devinfo->gen >= 8) {
+      /* From the Intel Broadwell PRM, vol 07, section "Instruction Set
+       * Reference", subsection "EUISA Instructions", Send Message (page 990):
+       *
+       * "r127 must not be used for return address when there is a src and
+       * dest overlap in send instruction."
+       *
+       * We avoid using grf127 as part of the destination of send messages
+       * by adding a node interference with the grf127_send_hack_node.
+       * This node has a fixed assignment to grf127.
+       *
+       * We don't apply it to SIMD16 instructions because previous code avoids
+       * any register overlap between sources and destination.
         */
+      ra_set_node_reg(g, grf127_send_hack_node, 127);
        foreach_block_and_inst(block, fs_inst, inst, cfg) {
-         if (inst->dst.file != VGRF)
-            continue;
+         if (inst->exec_size < 16 && inst->is_send_from_grf() &&
+             inst->dst.file == VGRF)
+            ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
+      }
  
-         for (int i = 0; i < inst->sources; ++i) {
-            if (inst->src[i].file == VGRF) {
-               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+      if (spilled_any_registers) {
+         foreach_block_and_inst(block, fs_inst, inst, cfg) {
+            /* Spilling instructions are generated as SEND messages from MRF,
+             * but as Gen7+ supports sending from GRF, the driver will assign
+             * these MRF registers to a GRF. The implementation reuses
+             * the dest of the send message as source. So, as we will have an
+             * overlap for sure, we create an interference between destination
+             * and grf127.
+             */
+            if ((inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ ||
+                 inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) &&
+                inst->dst.file == VGRF)
+               ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node);
+         }
+      }
+   }
+
+   /* From the Skylake PRM Vol. 2a docs for sends:
+    *
+    *    "It is required that the second block of GRFs does not overlap with
+    *    the first block."
+    *
+    * Normally, this is taken care of by fixup_sends_duplicate_payload() but
+    * in the case where one of the registers is an undefined value, the
+    * register allocator may decide that they don't interfere even though
+    * they're used as sources in the same instruction.  We also need to add
+    * interference here.
+    */
+   if (devinfo->gen >= 9) {
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
+             inst->src[2].file == VGRF &&
+             inst->src[3].file == VGRF &&
+             inst->src[2].nr != inst->src[3].nr) {
+            for (unsigned i = 0; i < inst->mlen; i++) {
+               for (unsigned j = 0; j < inst->ex_mlen; j++) {
+                  ra_add_node_interference(g, inst->src[2].nr + i,
+                                           inst->src[3].nr + j);
+               }
              }
           }
        }
@@ -864,15 +932,27 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
     }
  
     for (unsigned i = 0; i < this->alloc.count; i++) {
+      int live_length = virtual_grf_end[i] - virtual_grf_start[i];
+      if (live_length <= 0)
+         continue;
+
+      /* Divide the cost (in number of spills/fills) by the log of the length
+       * of the live range of the register.  This will encourage spill logic
+       * to spill long-living things before spilling short-lived things where
+       * spilling is less likely to actually do us any good.  We use the log
+       * of the length because it will fall off very quickly and not cause us
+       * to spill medium length registers with more uses.
+       */
+      float adjusted_cost = spill_costs[i] / logf(live_length);
        if (!no_spill[i])
-        ra_set_node_spill_cost(g, i, spill_costs[i]);
+        ra_set_node_spill_cost(g, i, adjusted_cost);
     }
  
     return ra_get_best_spill_node(g);
  }
  
  void
-fs_visitor::spill_reg(int spill_reg)
+fs_visitor::spill_reg(unsigned spill_reg)
  {
     int size = alloc.sizes[spill_reg];
     unsigned int spill_offset = last_scratch;
@@ -986,7 +1066,7 @@ fs_visitor::spill_reg(int spill_reg)
            * write, there should be no need for the unspill since the
            * instruction will be overwriting the whole destination in any case.
           */
-         if (inst->is_partial_write() ||
+         if (inst->is_partial_reg_write() ||
               (!inst->force_writemask_all && !per_channel))
              emit_unspill(ubld, spill_src, subset_spill_offset,
                           regs_written(inst));