From e328fbd9f8c8e5ad2a41e249bf18be5642d46d8d Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Mon, 30 Dec 2019 00:38:08 -0800 Subject: [PATCH] intel/fs: Add partial support for copy-propagating FIXED_GRFs. This will be useful for eliminating redundant copies from the FS thread payload, particularly in SIMD32 programs. For the moment we only allow FIXED_GRFs with identity strides in order to avoid dealing with composing the arbitrary bidimensional strides that FIXED_GRF regions potentially have, which are rarely used at the IR level anyway. This enables the following commit allowing block-propagation of FIXED_GRF LOAD_PAYLOAD copies, and prevents the following shader-db regressions (including SIMD32 programs) in combination with the interpolation rework part of this series. On ICL: total instructions in shared programs: 20484665 -> 20529650 (0.22%) instructions in affected programs: 6031235 -> 6076220 (0.75%) helped: 5 HURT: 42073 total spills in shared programs: 8748 -> 8925 (2.02%) spills in affected programs: 186 -> 363 (95.16%) helped: 5 HURT: 9 total fills in shared programs: 8663 -> 8960 (3.43%) fills in affected programs: 647 -> 944 (45.90%) helped: 5 HURT: 9 On SKL: total instructions in shared programs: 18937442 -> 19128162 (1.01%) instructions in affected programs: 8378187 -> 8568907 (2.28%) helped: 39 HURT: 68176 LOST: 1 GAINED: 4 On SNB: total instructions in shared programs: 14094685 -> 14243499 (1.06%) instructions in affected programs: 7751062 -> 7899876 (1.92%) helped: 623 HURT: 53586 LOST: 7 GAINED: 25 Reviewed-by: Kenneth Graunke --- .../compiler/brw_fs_copy_propagation.cpp | 89 ++++++++++++++++--- 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index e6eb174e0f2..adf0f2fefff 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -452,7 +452,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (entry->src.file == IMM) return false; assert(entry->src.file == VGRF || entry->src.file == UNIFORM || - entry->src.file == ATTR); + entry->src.file == ATTR || entry->src.file == FIXED_GRF); if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD && inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) @@ -469,6 +469,21 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) entry->dst, entry->size_written)) return false; + /* Avoid propagating a FIXED_GRF register into an EOT instruction in order + * for any register allocation restrictions to be applied. + */ + if (entry->src.file == FIXED_GRF && inst->eot) + return false; + + /* Avoid propagating odd-numbered FIXED_GRF registers into the first source + * of a LINTERP instruction on platforms where the PLN instruction has + * register alignment restrictions. + */ + if (devinfo->has_pln && devinfo->gen <= 6 && + entry->src.file == FIXED_GRF && (entry->src.nr & 1) && + inst->opcode == FS_OPCODE_LINTERP && arg == 0) + return false; + /* we can't generally copy-propagate UD negations because we * can end up accessing the resulting values as signed integers * instead. See also resolve_ud_negate() and comment in @@ -493,16 +508,30 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) * derivatives, assume that their operands are packed so we can't * generally propagate strided regions to them. */ - if (instruction_requires_packed_data(inst) && entry->src.stride > 1) + const unsigned entry_stride = (entry->src.file == FIXED_GRF ? 1 : + entry->src.stride); + if (instruction_requires_packed_data(inst) && entry_stride > 1) return false; /* Bail if the result of composing both strides would exceed the * hardware limit. */ - if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride, + if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride, devinfo)) return false; + /* Bail if the source FIXED_GRF region of the copy cannot be trivially + * composed with the source region of the instruction -- E.g. because the + * copy uses some extended stride greater than 4 not supported natively by + * the hardware as a horizontal stride, or because instruction compression + * could require us to use a vertical stride shorter than a GRF. + */ + if (entry->src.file == FIXED_GRF && + (inst->src[arg].stride > 4 || + inst->dst.component_size(inst->exec_size) > + inst->src[arg].component_size(inst->exec_size))) + return false; + /* Bail if the instruction type is larger than the execution type of the * copy, what implies that each channel is reading multiple channels of the * destination of the copy, and simply replacing the sources would give a @@ -524,7 +553,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) * * Which would have different semantics. */ - if (entry->src.stride != 1 && + if (entry_stride != 1 && (inst->src[arg].stride * type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0) return false; @@ -562,13 +591,42 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) } } + /* Save the offset of inst->src[arg] relative to entry->dst for it to be + * applied later. + */ + const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset; + + /* Fold the copy into the instruction consuming it. */ inst->src[arg].file = entry->src.file; inst->src[arg].nr = entry->src.nr; - inst->src[arg].stride *= entry->src.stride; - inst->saturate = inst->saturate || entry->saturate; + inst->src[arg].subnr = entry->src.subnr; + inst->src[arg].offset = entry->src.offset; + + /* Compose the strides of both regions. */ + if (entry->src.file == FIXED_GRF) { + if (inst->src[arg].stride) { + const unsigned orig_width = 1 << entry->src.width; + const unsigned reg_width = REG_SIZE / (type_sz(inst->src[arg].type) * + inst->src[arg].stride); + inst->src[arg].width = cvt(MIN2(orig_width, reg_width)) - 1; + inst->src[arg].hstride = cvt(inst->src[arg].stride); + inst->src[arg].vstride = inst->src[arg].hstride + inst->src[arg].width; + } else { + inst->src[arg].vstride = inst->src[arg].hstride = + inst->src[arg].width = 0; + } - /* Compute the offset of inst->src[arg] relative to entry->dst */ - const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset; + inst->src[arg].stride = 1; + + /* Hopefully no Align16 around here... */ + assert(entry->src.swizzle == BRW_SWIZZLE_XYZW); + inst->src[arg].swizzle = entry->src.swizzle; + } else { + inst->src[arg].stride *= entry->src.stride; + } + + /* Compose any saturate modifiers. */ + inst->saturate = inst->saturate || entry->saturate; /* Compute the first component of the copy that the instruction is * reading, and the base byte offset within that component. @@ -580,9 +638,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) /* Calculate the byte offset at the origin of the copy of the given * component and suboffset. */ - inst->src[arg].offset = suboffset + - component * entry->src.stride * type_sz(entry->src.type) + - entry->src.offset; + inst->src[arg] = byte_offset(inst->src[arg], + component * entry_stride * type_sz(entry->src.type) + suboffset); if (has_source_modifiers) { if (entry->dst.type != inst->src[arg].type) { @@ -834,7 +891,9 @@ can_propagate_from(fs_inst *inst) inst->src[0], inst->size_read(0))) || inst->src[0].file == ATTR || inst->src[0].file == UNIFORM || - inst->src[0].file == IMM) && + inst->src[0].file == IMM || + (inst->src[0].file == FIXED_GRF && + inst->src[0].is_contiguous())) && inst->src[0].type == inst->dst.type && !inst->is_partial_write()); } @@ -863,7 +922,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block, } /* kill the destination from the ACP */ - if (inst->dst.file == VGRF) { + if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) { if (regions_overlap(entry->dst, entry->size_written, inst->dst, inst->size_written)) @@ -905,7 +964,9 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block, assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0); const unsigned size_written = effective_width * type_sz(inst->src[i].type); - if (inst->src[i].file == VGRF) { + if (inst->src[i].file == VGRF || + (inst->src[i].file == FIXED_GRF && + inst->src[i].is_contiguous())) { acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry); entry->dst = byte_offset(inst->dst, offset); entry->src = inst->src[i]; -- 2.30.2