From: Matt Turner Date: Tue, 9 Jun 2020 20:51:10 +0000 (-0700) Subject: intel/compiler: Drop opt_sampler_eot() X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=66111bc95a5bba96ae39a4274c98cc4e3e183cae;p=mesa.git intel/compiler: Drop opt_sampler_eot() Gen9 and Cherryview have the ability to mark texture instructions with the End-of-thread bit under some conditions, which allows the texture result to be written to the render target directly, rather than returning to the EU. In order to handle overlapping primitives correctly, we have to use the 'sendc' instruction which stalls until other threads potentially writing to the same locations in the render target are retired. Unfortunately, this stall happens before the texture is sampled (rather than in parallel with stall), so for some literal edge cases (like the diagonal edge between two triangles forming a rectangle) there can be a performance penalty. As a result, it's probably not a good idea to use this optimization in general. I had planned to leave it enabled only for BLORP, where we use rectangle primitives and are typically clearing/blitting an entire render target without any overlapping primitives, but I noticed that the optimization wasn't applied in some normal cases anyway. For example, in the piglit test tests/shaders/glsl-fs-texture2d-bias.shader_test it is applied to one BLORP-blit shader but not another due to some kind of mishandling of register types (the destination register type of the texture operation is UD while the color source of the render target write is F). Additionally the instruction scheduler assumed that the combined texture and render target write operation took 0 cycles, leading to cycle estimates that are wildly inaccurate. Since the optimization was not implemented for SIMD32 and our decision whether to use the SIMD32 program is made by comparing the estimated performance with that of the SIMD16 shader, we wrongly threw out a bunch of SIMD32 programs that are likely profitable. total cycles in shared programs: 472807891 -> 473784245 (0.21%) cycles in affected programs: 108277 -> 1084631 (901.72%) helped: 0 HURT: 1290 total sends in shared programs: 998955 -> 1000245 (0.13%) sends in affected programs: 1400 -> 2690 (92.14%) helped: 0 HURT: 1290 LOST: 0 GAINED: 33 This patch shows no performance changes in Intel's Mesa performance CI. Given the problems, the lack of evidence that the pass improves performance, and the fact that the hardware feature was removed from subsequent GPU generations, I think that the pass is not valuable and should be removed. Reviewed-by: Jason Ekstrand Reviewed-by: Francisco Jerez Reviewed-by: Kenneth Graunke Signed-off-by: Matt Turner Part-of: --- diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 3894217c045..514acc88b33 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2942,102 +2942,6 @@ fs_visitor::opt_zero_samples() return progress; } -/** - * Optimize sample messages which are followed by the final RT write. - * - * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its - * results sent directly to the framebuffer, bypassing the EU. Recognize the - * final texturing results copied to the framebuffer write payload and modify - * them to write to the framebuffer directly. - */ -bool -fs_visitor::opt_sampler_eot() -{ - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - - if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16) - return false; - - if (devinfo->gen != 9 && !devinfo->is_cherryview) - return false; - - /* FINISHME: It should be possible to implement this optimization when there - * are multiple drawbuffers. - */ - if (key->nr_color_regions != 1) - return false; - - /* Requires emitting a bunch of saturating MOV instructions during logical - * send lowering to clamp the color payload, which the sampler unit isn't - * going to do for us. - */ - if (key->clamp_fragment_color) - return false; - - /* Look for a texturing instruction immediately before the final FB_WRITE. */ - bblock_t *block = cfg->blocks[cfg->num_blocks - 1]; - fs_inst *fb_write = (fs_inst *)block->end(); - assert(fb_write->eot); - assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL); - - /* There wasn't one; nothing to do. */ - if (unlikely(fb_write->prev->is_head_sentinel())) - return false; - - fs_inst *tex_inst = (fs_inst *) fb_write->prev; - - /* 3D Sampler » Messages » Message Format - * - * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler - * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*” - */ - if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL && - tex_inst->opcode != FS_OPCODE_TXB_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL && - tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL) - return false; - - /* XXX - This shouldn't be necessary. */ - if (tex_inst->prev->is_head_sentinel()) - return false; - - /* Check that the FB write sources are fully initialized by the single - * texturing instruction. - */ - for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) { - if (i == FB_WRITE_LOGICAL_SRC_COLOR0) { - if (!fb_write->src[i].equals(tex_inst->dst) || - fb_write->size_read(i) != tex_inst->size_written) - return false; - } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) { - if (fb_write->src[i].file != BAD_FILE) - return false; - } - } - - assert(!tex_inst->eot); /* We can't get here twice */ - assert((tex_inst->offset & (0xff << 24)) == 0); - - const fs_builder ibld(this, block, tex_inst); - - tex_inst->offset |= fb_write->target << 24; - tex_inst->eot = true; - tex_inst->dst = ibld.null_reg_ud(); - tex_inst->size_written = 0; - fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); - - /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT - * flag and submit a header together with the sampler message as required - * by the hardware. - */ - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - return true; -} - bool fs_visitor::opt_register_renaming() { @@ -7573,10 +7477,6 @@ fs_visitor::optimize() OPT(lower_simd_width); OPT(lower_barycentrics); - - /* After SIMD lowering just in case we had to unroll the EOT send. */ - OPT(opt_sampler_eot); - OPT(lower_logical_sends); /* After logical SEND lowering. */ diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 5bf5cdeed41..c24438a737e 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -175,7 +175,6 @@ public: bool remove_duplicate_mrf_writes(); bool remove_extra_rounding_modes(); - bool opt_sampler_eot(); void schedule_instructions(instruction_scheduler_mode mode); void insert_gen4_send_dependency_workarounds(); void insert_gen4_pre_send_dependency_workarounds(bblock_t *block,