intel/compiler: Allow MESA_SHADER_KERNEL
[mesa.git] / src / intel / compiler / brw_ir_performance.cpp
index 5785d839e3ada1290cf068ccf2b01719bcec8722..3c39594d1212dc0bd9d3fc8e4338236e4c4ea0e6 100644 (file)
@@ -934,11 +934,25 @@ namespace {
 
       case SHADER_OPCODE_MEMORY_FENCE:
       case SHADER_OPCODE_INTERLOCK:
-         if (devinfo->gen >= 7)
-            return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
-                                  10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
-         else
+         switch (info.sfid) {
+         case GEN6_SFID_DATAPORT_RENDER_CACHE:
+            if (devinfo->gen >= 7)
+               return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
+                                     10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
+            else
+               abort();
+
+         case GEN7_SFID_DATAPORT_DATA_CACHE:
+         case HSW_SFID_DATAPORT_DATA_CACHE_1:
+            if (devinfo->gen >= 7)
+               return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
+                                     10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
+            else
+               abort();
+
+         default:
             abort();
+         }
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
@@ -1508,9 +1522,19 @@ namespace {
        *       difference is the worst-case scenario branch_weight used for
        *       SIMD32 which accounts for the possibility of a dynamically
        *       uniform branch becoming divergent in SIMD32.
+       *
+       *       Note that we provide slightly more pessimistic weights on
+       *       Gen12+ for SIMD32, since the effective warp size on that
+       *       platform is 2x the SIMD width due to EU fusion, which increases
+       *       the likelihood of divergent control flow in comparison to
+       *       previous generations, giving narrower SIMD modes a performance
+       *       advantage in several test-cases with non-uniform discard jumps.
        */
       const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
+      const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
+                                    1.0 : 0.5);
       const float loop_weight = 10;
+      unsigned discard_count = 0;
       unsigned elapsed = 0;
       state st;
 
@@ -1524,6 +1548,8 @@ namespace {
 
             if (inst->opcode == BRW_OPCODE_ENDIF)
                st.weight /= branch_weight;
+            else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
+               st.weight /= discard_weight;
 
             elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
 
@@ -1533,6 +1559,8 @@ namespace {
                st.weight *= loop_weight;
             else if (inst->opcode == BRW_OPCODE_WHILE)
                st.weight /= loop_weight;
+            else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
+               st.weight *= discard_weight;
          }
 
          p.block_latency[block->num] = elapsed - elapsed0;