From d9cd982d556be560af3bcbcdaf62b6b93eb934a5 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 15 Feb 2015 20:06:59 -0800 Subject: [PATCH] i965/simd8vs: Fix SIMD8 atomics MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The short version: we need to set bits in R0.7 which provide a mask to be used for PS kill samples/pixels. Since the VS has no such concept, we just need to set them all to 1. The longer version... Execution for SIMD8 atomics is defined as follows: SIMD8: The low 8 bits of the execution mask are ANDed with 8 bits of the Pixel/Sample Mask from the message header. For the typed messages, the Slot Group in the message descriptor selects either the low or high 8 bits. For the untyped messages, the low 8 bits are always selected. The resulting mask is used to determine which slots are read into the destination GRF register (for read), or which slots are written to the surface (for write). If the header is not present, only the low 8 bits of the execution mask are used. The message header for untyped messages is defined in R0.7: "This field contains the 16-bit pixel/sample mask to be used for SIMD16 and SIMD8 messages. All 16 bits are used for SIMD16 messages. For typed SIMD8 messages, Slot Group selects which 8 bits of this field are used. For untyped SIMD8 messages, the low 8 bits of this field are used." Furthermore, "The message header for the untyped messages only needs to be delivered for pixel shader threads, where the execution mask may indicate pixels/samples that are enabled only due to derivative (LOD) calculations, but the corresponding slot on the surface must not be accessed." We're not using a pixel shader here, but AFAICT, this mask is used for all stages. This leaves two options: remove the header, or make the VS code emit the correct thing for the header. I believe one of the goals of using SIMD8 VS was to get as much code reuse as possible, and so I chose the latter. 
Since the VS has no such thing as kill instructions, the mask is derived simply as all 1's. v2: Add a comment to the code (stolen from Curro on the mailing list) Change the control flow style (Curro + Jason) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=87258 Cc: Kristian Høgsberg Signed-off-by: Ben Widawsky Reviewed-by: Francisco Jerez Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 24 +++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 2a36d942838..c7f321fe65d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -2978,9 +2978,6 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, fs_reg dst, fs_reg offset, fs_reg src0, fs_reg src1) { - bool uses_kill = - (stage == MESA_SHADER_FRAGMENT) && - ((brw_wm_prog_data*) this->prog_data)->uses_kill; int reg_width = dispatch_width / 8; int length = 0; @@ -2991,13 +2988,24 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, emit(MOV(sources[0], fs_reg(0u))) ->force_writemask_all = true; - if (uses_kill) { - emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1))) - ->force_writemask_all = true; + if (stage == MESA_SHADER_FRAGMENT) { + if (((brw_wm_prog_data*)this->prog_data)->uses_kill) { + emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1))) + ->force_writemask_all = true; + } else { + emit(MOV(component(sources[0], 7), + retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD))) + ->force_writemask_all = true; + } } else { + /* The execution mask is part of the side-band information sent together with + * the message payload to the data port. It's implicitly ANDed with the sample + * mask sent in the header to compute the actual set of channels that execute + * the atomic operation. 
+ */ + assert(stage == MESA_SHADER_VERTEX); emit(MOV(component(sources[0], 7), - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD))) - ->force_writemask_all = true; + brw_imm_ud(0xff)))->force_writemask_all = true; } length++; -- 2.30.2