X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fcompiler%2Fbrw_ir_performance.cpp;h=3c39594d1212dc0bd9d3fc8e4338236e4c4ea0e6;hb=003b04e266ae0faad563c1228561b53f33a68474;hp=5785d839e3ada1290cf068ccf2b01719bcec8722;hpb=188a3659aea6dec9acf1c2fd15fcaecffe4f7d4e;p=mesa.git diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index 5785d839e3a..3c39594d121 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -934,11 +934,25 @@ namespace { case SHADER_OPCODE_MEMORY_FENCE: case SHADER_OPCODE_INTERLOCK: - if (devinfo->gen >= 7) - return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0, - 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); - else + switch (info.sfid) { + case GEN6_SFID_DATAPORT_RENDER_CACHE: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0, + 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); + else + abort(); + + case GEN7_SFID_DATAPORT_DATA_CACHE: + case HSW_SFID_DATAPORT_DATA_CACHE_1: + if (devinfo->gen >= 7) + return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0, + 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); + else + abort(); + + default: abort(); + } case SHADER_OPCODE_GEN4_SCRATCH_READ: case SHADER_OPCODE_GEN4_SCRATCH_WRITE: @@ -1508,9 +1522,19 @@ namespace { * difference is the worst-case scenario branch_weight used for * SIMD32 which accounts for the possibility of a dynamically * uniform branch becoming divergent in SIMD32. + * + * Note that we provide slightly more pessimistic weights on + * Gen12+ for SIMD32, since the effective warp size on that + * platform is 2x the SIMD width due to EU fusion, which increases + * the likelihood of divergent control flow in comparison to + * previous generations, giving narrower SIMD modes a performance + * advantage in several test-cases with non-uniform discard jumps. */ const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5); + const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ? + 1.0 : 0.5); const float loop_weight = 10; + unsigned discard_count = 0; unsigned elapsed = 0; state st; @@ -1524,6 +1548,8 @@ namespace { if (inst->opcode == BRW_OPCODE_ENDIF) st.weight /= branch_weight; + else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count) + st.weight /= discard_weight; elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight; @@ -1533,6 +1559,8 @@ namespace { st.weight *= loop_weight; else if (inst->opcode == BRW_OPCODE_WHILE) st.weight /= loop_weight; + else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++) + st.weight *= discard_weight; } p.block_latency[block->num] = elapsed - elapsed0;