intel/compiler: Allow MESA_SHADER_KERNEL

[mesa.git] / src / intel / compiler / brw_ir_performance.cpp
diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp

index 5785d839e3ada1290cf068ccf2b01719bcec8722..3c39594d1212dc0bd9d3fc8e4338236e4c4ea0e6 100644 (file)
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -934,11 +934,25 @@ namespace {
  
        case SHADER_OPCODE_MEMORY_FENCE:
        case SHADER_OPCODE_INTERLOCK:
-         if (devinfo->gen >= 7)
-            return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
-                                  10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
-         else
+         switch (info.sfid) {
+         case GEN6_SFID_DATAPORT_RENDER_CACHE:
+            if (devinfo->gen >= 7)
+               return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
+                                     10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
+            else
+               abort();
+
+         case GEN7_SFID_DATAPORT_DATA_CACHE:
+         case HSW_SFID_DATAPORT_DATA_CACHE_1:
+            if (devinfo->gen >= 7)
+               return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
+                                     10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
+            else
+               abort();
+
+         default:
              abort();
+         }
  
        case SHADER_OPCODE_GEN4_SCRATCH_READ:
        case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
@@ -1508,9 +1522,19 @@ namespace {
         *       difference is the worst-case scenario branch_weight used for
         *       SIMD32 which accounts for the possibility of a dynamically
         *       uniform branch becoming divergent in SIMD32.
+       *
+       *       Note that we provide slightly more pessimistic weights on
+       *       Gen12+ for SIMD32, since the effective warp size on that
+       *       platform is 2x the SIMD width due to EU fusion, which increases
+       *       the likelihood of divergent control flow in comparison to
+       *       previous generations, giving narrower SIMD modes a performance
+       *       advantage in several test-cases with non-uniform discard jumps.
         */
        const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
+      const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
+                                    1.0 : 0.5);
        const float loop_weight = 10;
+      unsigned discard_count = 0;
        unsigned elapsed = 0;
        state st;
  
@@ -1524,6 +1548,8 @@ namespace {
  
              if (inst->opcode == BRW_OPCODE_ENDIF)
                 st.weight /= branch_weight;
+            else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
+               st.weight /= discard_weight;
  
              elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
  
@@ -1533,6 +1559,8 @@ namespace {
                 st.weight *= loop_weight;
              else if (inst->opcode == BRW_OPCODE_WHILE)
                 st.weight /= loop_weight;
+            else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
+               st.weight *= discard_weight;
           }
  
           p.block_latency[block->num] = elapsed - elapsed0;