From 45d4665dc749fa52cc165d8d22356c8d8b5b3e22 Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Tue, 3 Mar 2020 13:20:47 -0800 Subject: [PATCH] intel/fs: Fix workaround for VxH indirect addressing bug under control flow. The current workaround for this hardware bug involved marking the ADD instruction used to initialize the address register as NoMask on Gen12, which was based on the assumption that the problem was caused by a hardware bug affecting the application of the execution mask to the address register write. However that doesn't seem to be the case: The address register write was working correctly, the real problem leading to hangs on TGL is that the indirect addressing logic is unable to deal with garbage values in the address register (e.g. misaligned offsets), even for channels which are currently inactive due to non-uniform control flow. The current workaround isn't able to avoid that situation in general, since the result of the NoMask ADD instruction for a dead channel is calculated based on the corresponding (dead) component of the indirect_byte_offset source, which would still be undefined in the likely case that the source was initialized under control flow itself. This would lead to hangs whenever MOV_INDIRECT was used under non-uniform control flow in some scenarios like a tessellation shader from GFXBench5/gl_4 (AKA Car Chase) on TGL. In addition I've managed to reproduce the same issue on earlier platforms by initializing the whole address register with garbage before the ADD instruction, so this seems to be a long-standing issue we have avoided mostly by luck. This patch fixes the problem and applies the workaround to all platforms, since even when the hardware is able to deal with garbage address values without hanging there might be a significant performance cost from reading random GRF registers due to the useless extra EU cycles spent fetching registers for dead channels and due to the potential for unintended serialization with respect to other random instructions that could be executed in parallel, which may have had a cost of the order of hundreds of cycles in the worst case scenario. Fixes: f93dfb509c "intel/fs: Write the address register with NoMask for MOV_INDIRECT" Tested-by: Rafael Antognolli Reviewed-by: Jason Ekstrand --- src/intel/compiler/brw_fs_generator.cpp | 38 ++++++++++++++++++------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index dd2e753abd1..3a2f2a9b7ce 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -418,6 +418,13 @@ fs_generator::generate_mov_indirect(fs_inst *inst, /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ struct brw_reg addr = vec8(brw_address_reg(0)); + /* Whether we can use destination dependency control without running the + * risk of a hang if an instruction gets shot down. + */ + const bool use_dep_ctrl = !inst->predicate && + inst->exec_size == dispatch_width; + brw_inst *insn; + /* The destination stride of an instruction (in bytes) must be greater * than or equal to the size of the rest of the instruction. Since the * address register is of type UW, we can't use a D-type instruction. @@ -451,17 +458,28 @@ fs_generator::generate_mov_indirect(fs_inst *inst, * code, using it saves us 0 instructions and would require quite a bit * of case-by-case work. It's just not worth it. * - * There's some sort of HW bug on Gen12 which causes issues if we write - * to the address register in control-flow. Since we only ever touch - * the address register from the generator, we can easily enough work - * around it by setting NoMask on the add. + * Due to a hardware bug some platforms (particularly Gen11+) seem to + * require the address components of all channels to be valid whether or + * not they're active, which causes issues if we use VxH addressing + * under non-uniform control-flow. We can easily work around that by + * initializing the whole address register with a pipelined NoMask MOV + * instruction. */ - brw_push_insn_state(p); - if (devinfo->gen == 12) - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); - brw_pop_insn_state(p); - brw_set_default_swsb(p, tgl_swsb_regdist(1)); + if (devinfo->gen >= 7) { + insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset)); + brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); + brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_null()); + else + brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl); + } + + insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + if (devinfo->gen >= 12) + brw_set_default_swsb(p, tgl_swsb_regdist(1)); + else if (devinfo->gen >= 7) + brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl); if (type_sz(reg.type) > 4 && ((devinfo->gen == 7 && !devinfo->is_haswell) || -- 2.30.2