static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
struct blorp_address *addr);
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+ const struct blorp_address *addrs,
+ unsigned num_vbs);
#if GEN_GEN >= 8
static struct blorp_address
blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
}
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+ struct blorp_address dst,
+ struct blorp_address src,
+ uint32_t size);
+#endif
+
static void
blorp_emit_vertex_data(struct blorp_batch *batch,
const struct blorp_params *params,
}
blorp_flush_range(batch, data, *size);
+
+ if (params->dst_clear_color_as_input) {
+#if GEN_GEN >= 7
+ /* In this case, the clear color isn't known statically and instead
+ * comes in through an indirect which we have to copy into the vertex
+ * buffer before we execute the 3DPRIMITIVE. We already copied the
+ * value of params->wm_inputs.clear_color into the vertex buffer in the
+ * loop above. Now we emit code to stomp it from the GPU with the
+ * actual clear color value.
+ */
+ assert(num_varyings == 1);
+
+ /* The clear color is the first thing after the header */
+ struct blorp_address clear_color_input_addr = *addr;
+ clear_color_input_addr.offset += 16;
+
+ const unsigned clear_color_size =
+ GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
+ blorp_emit_memcpy(batch, clear_color_input_addr,
+ params->dst.clear_color_addr,
+ clear_color_size);
+#else
+ unreachable("MCS partial resolve is not a thing on SNB and earlier");
+#endif
+ }
}
static void
vb[idx].BufferPitch = stride;
#if GEN_GEN >= 6
- vb[idx].VertexBufferMOCS = addr.mocs;
+ vb[idx].MOCS = addr.mocs;
#endif
#if GEN_GEN >= 7
const struct blorp_params *params)
{
struct GENX(VERTEX_BUFFER_STATE) vb[3];
+ uint32_t num_vbs = 2;
memset(vb, 0, sizeof(vb));
- struct blorp_address addr;
+ struct blorp_address addrs[2] = {};
uint32_t size;
- blorp_emit_vertex_data(batch, params, &addr, &size);
- blorp_fill_vertex_buffer_state(batch, vb, 0, addr, size, 3 * sizeof(float));
+ blorp_emit_vertex_data(batch, params, &addrs[0], &size);
+ blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], size,
+ 3 * sizeof(float));
- blorp_emit_input_varying_data(batch, params, &addr, &size);
- blorp_fill_vertex_buffer_state(batch, vb, 1, addr, size, 0);
-
- uint32_t num_vbs = 2;
- if (params->dst_clear_color_as_input) {
- const unsigned clear_color_size =
- GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
- blorp_fill_vertex_buffer_state(batch, vb, num_vbs++,
- params->dst.clear_color_addr,
- clear_color_size, 0);
- }
+ blorp_emit_input_varying_data(batch, params, &addrs[1], &size);
+ blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], size, 0);
const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
if (!dw)
return;
+ blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, num_vbs);
+
for (unsigned i = 0; i < num_vbs; i++) {
GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
dw += GENX(VERTEX_BUFFER_STATE_length);
};
slot++;
- if (params->dst_clear_color_as_input) {
- /* If the caller wants the destination indirect clear color, redirect
- * to vertex buffer 2 where we stored it earlier. The only users of
- * an indirect clear color source have that as their only vertex
- * attribute.
- */
- assert(num_varyings == 1);
+ for (unsigned i = 0; i < num_varyings; ++i) {
ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
- .VertexBufferIndex = 2,
+ .VertexBufferIndex = 1,
.Valid = true,
- .SourceElementOffset = 0,
- .Component0Control = VFCOMP_STORE_SRC,
-#if GEN_GEN >= 9
.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+ .SourceElementOffset = 16 + i * 4 * sizeof(float),
+ .Component0Control = VFCOMP_STORE_SRC,
.Component1Control = VFCOMP_STORE_SRC,
.Component2Control = VFCOMP_STORE_SRC,
.Component3Control = VFCOMP_STORE_SRC,
-#else
- /* Clear colors on gen7-8 are for bits out of one dword */
- .SourceElementFormat = ISL_FORMAT_R32_FLOAT,
- .Component1Control = VFCOMP_STORE_0,
- .Component2Control = VFCOMP_STORE_0,
- .Component3Control = VFCOMP_STORE_0,
+#if GEN_GEN <= 5
+ .DestinationElementOffset = slot * 4,
#endif
};
slot++;
- } else {
- for (unsigned i = 0; i < num_varyings; ++i) {
- ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
- .VertexBufferIndex = 1,
- .Valid = true,
- .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
- .SourceElementOffset = 16 + i * 4 * sizeof(float),
- .Component0Control = VFCOMP_STORE_SRC,
- .Component1Control = VFCOMP_STORE_SRC,
- .Component2Control = VFCOMP_STORE_SRC,
- .Component3Control = VFCOMP_STORE_SRC,
-#if GEN_GEN <= 5
- .DestinationElementOffset = slot * 4,
-#endif
- };
- slot++;
- }
}
const unsigned num_dwords =
ps.BindingTableEntryCount = 1;
}
- if (prog_data) {
- ps.DispatchGRFStartRegisterForConstantSetupData0 =
- prog_data->base.dispatch_grf_start_reg;
- ps.DispatchGRFStartRegisterForConstantSetupData2 =
- prog_data->dispatch_grf_start_reg_2;
+ /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to
+ * disable prefetching of binding tables on A0 and B0 steppings.
+ * TODO: Revisit this WA on C0 stepping.
+ */
+ if (GEN_GEN == 11)
+ ps.BindingTableEntryCount = 0;
+ if (prog_data) {
ps._8PixelDispatchEnable = prog_data->dispatch_8;
ps._16PixelDispatchEnable = prog_data->dispatch_16;
+ ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+ /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+ *
+ * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+ * Dispatch must not be enabled for PER_PIXEL dispatch mode."
+ *
+ * Since 16x MSAA is first introduced on SKL, we don't need to apply
+ * the workaround on any older hardware.
+ */
+ if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+ params->num_samples == 16) {
+ assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+ ps._32PixelDispatchEnable = false;
+ }
- ps.KernelStartPointer0 = params->wm_prog_kernel;
- ps.KernelStartPointer2 =
- params->wm_prog_kernel + prog_data->prog_offset_2;
+ ps.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+ ps.DispatchGRFStartRegisterForConstantSetupData2 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+ ps.KernelStartPointer0 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+ ps.KernelStartPointer1 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+ ps.KernelStartPointer2 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 2);
}
/* 3DSTATE_PS expects the number of threads per PSD, which is always 64
#endif
if (prog_data) {
+ ps._8PixelDispatchEnable = prog_data->dispatch_8;
+ ps._16PixelDispatchEnable = prog_data->dispatch_16;
+ ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
ps.DispatchGRFStartRegisterForConstantSetupData0 =
- prog_data->base.dispatch_grf_start_reg;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
ps.DispatchGRFStartRegisterForConstantSetupData2 =
- prog_data->dispatch_grf_start_reg_2;
-
- ps.KernelStartPointer0 = params->wm_prog_kernel;
- ps.KernelStartPointer2 =
- params->wm_prog_kernel + prog_data->prog_offset_2;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
- ps._8PixelDispatchEnable = prog_data->dispatch_8;
- ps._16PixelDispatchEnable = prog_data->dispatch_16;
+ ps.KernelStartPointer0 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+ ps.KernelStartPointer1 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+ ps.KernelStartPointer2 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 2);
ps.AttributeEnable = prog_data->num_varying_inputs > 0;
} else {
if (prog_data) {
wm.ThreadDispatchEnable = true;
+ wm._8PixelDispatchEnable = prog_data->dispatch_8;
+ wm._16PixelDispatchEnable = prog_data->dispatch_16;
+ wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
wm.DispatchGRFStartRegisterForConstantSetupData0 =
- prog_data->base.dispatch_grf_start_reg;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
+ wm.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
wm.DispatchGRFStartRegisterForConstantSetupData2 =
- prog_data->dispatch_grf_start_reg_2;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
- wm.KernelStartPointer0 = params->wm_prog_kernel;
- wm.KernelStartPointer2 =
- params->wm_prog_kernel + prog_data->prog_offset_2;
-
- wm._8PixelDispatchEnable = prog_data->dispatch_8;
- wm._16PixelDispatchEnable = prog_data->dispatch_16;
+ wm.KernelStartPointer0 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+ wm.KernelStartPointer1 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+ wm.KernelStartPointer2 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 2);
wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
}
#endif /* GEN_GEN >= 6 */
-#if GEN_GEN >= 7 && GEN_GEN < 10
+#if GEN_GEN >= 7
static void
blorp_emit_memcpy(struct blorp_batch *batch,
struct blorp_address dst,
*/
blorp_emit_3dstate_multisample(batch, params);
+ /* From the BDW PRM Volume 7, Depth Buffer Clear:
+ *
+ * The clear value must be between the min and max depth values
+ * (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
+ * D32_FLOAT, then +/-DENORM values are also allowed.
+ *
+ * Set the bounds to match our hardware limits, [0.0, 1.0].
+ */
+ if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
+ assert(params->depth.clear_color.f32[0] >= 0.0f);
+ assert(params->depth.clear_color.f32[0] <= 1.0f);
+ blorp_emit_cc_viewport(batch);
+ }
+
+ /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
+ * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
+ * even when WM_HZ_OP is active. However, WM thread dispatch is normally
+ * disabled for HiZ ops and it appears that force-enabling it can lead to
+ * GPU hangs on at least Skylake. Since we don't know the current state of
+ * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
+ */
+ blorp_emit(batch, GENX(3DSTATE_WM), wm);
+
/* If we can't alter the depth stencil config and multiple layers are
* involved, the HiZ op will fail. This is because the op requires that a
* new config is emitted for each additional layer.
static void
blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
{
- blorp_update_clear_color(batch, ¶ms->dst, params->fast_clear_op);
- blorp_update_clear_color(batch, ¶ms->depth, params->hiz_op);
+ if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
+ blorp_update_clear_color(batch, ¶ms->dst, params->fast_clear_op);
+ blorp_update_clear_color(batch, ¶ms->depth, params->hiz_op);
+ }
#if GEN_GEN >= 8
if (params->hiz_op != ISL_AUX_OP_NONE) {