#include "blorp_priv.h"
#include "dev/gen_device_info.h"
#include "common/gen_sample_positions.h"
+#include "common/gen_l3_config.h"
#include "genxml/gen_macros.h"
/**
static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
struct blorp_address *addr);
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+ const struct blorp_address *addrs,
+ uint32_t *sizes,
+ unsigned num_vbs);
-#if GEN_GEN >= 8
-static struct blorp_address
+UNUSED static struct blorp_address
blorp_get_workaround_page(struct blorp_batch *batch);
-#endif
static void
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
struct blorp_address address, uint32_t delta);
+static uint64_t
+blorp_get_surface_address(struct blorp_batch *batch,
+ struct blorp_address address);
+
#if GEN_GEN >= 7 && GEN_GEN < 10
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch);
#endif
+#if GEN_GEN >= 7
+static const struct gen_l3_config *
+blorp_get_l3_config(struct blorp_batch *batch);
+# else
static void
blorp_emit_urb_config(struct blorp_batch *batch,
unsigned vs_entry_size, unsigned sf_entry_size);
+#endif
static void
blorp_emit_pipeline(struct blorp_batch *batch,
_blorp_cmd_pack(cmd)(batch, (void *)_dst, &name), \
_dst = NULL)
-#define blorp_emitn(batch, cmd, n) ({ \
+#define blorp_emitn(batch, cmd, n, ...) ({ \
uint32_t *_dw = blorp_emit_dwords(batch, n); \
if (_dw) { \
struct cmd template = { \
_blorp_cmd_header(cmd), \
.DWordLength = n - _blorp_cmd_length_bias(cmd), \
+ __VA_ARGS__ \
}; \
_blorp_cmd_pack(cmd)(batch, _dw, &template); \
} \
const unsigned sf_entry_size =
params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0;
+#if GEN_GEN >= 7
+ assert(sf_entry_size == 0);
+ const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 };
+
+ unsigned entries[4], start[4];
+ gen_get_urb_config(batch->blorp->compiler->devinfo,
+ blorp_get_l3_config(batch),
+ false, false, entry_size, entries, start);
+
+#if GEN_GEN == 7 && !GEN_IS_HASWELL
+ /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
+ *
+ * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
+ * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
+ * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
+ * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL
+ * needs to be sent before any combination of VS associated 3DSTATE."
+ */
+ blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
+ pc.DepthStallEnable = true;
+ pc.PostSyncOperation = WriteImmediateData;
+ pc.Address = blorp_get_workaround_page(batch);
+ }
+#endif
+
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = start[i];
+ urb.VSURBEntryAllocationSize = entry_size[i] - 1;
+ urb.VSNumberofURBEntries = entries[i];
+ }
+ }
+#else /* GEN_GEN < 7 */
blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
+#endif
}
+#if GEN_GEN >= 7
+static void
+blorp_emit_memcpy(struct blorp_batch *batch,
+ struct blorp_address dst,
+ struct blorp_address src,
+ uint32_t size);
+#endif
+
static void
blorp_emit_vertex_data(struct blorp_batch *batch,
const struct blorp_params *params,
}
blorp_flush_range(batch, data, *size);
+
+ if (params->dst_clear_color_as_input) {
+#if GEN_GEN >= 7
+ /* In this case, the clear color isn't known statically and instead
+ * comes in through an indirect which we have to copy into the vertex
+ * buffer before we execute the 3DPRIMITIVE. We already copied the
+ * value of params->wm_inputs.clear_color into the vertex buffer in the
+ * loop above. Now we emit code to stomp it from the GPU with the
+ * actual clear color value.
+ */
+ assert(num_varyings == 1);
+
+ /* The clear color is the first thing after the header */
+ struct blorp_address clear_color_input_addr = *addr;
+ clear_color_input_addr.offset += 16;
+
+ const unsigned clear_color_size =
+ GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
+ blorp_emit_memcpy(batch, clear_color_input_addr,
+ params->dst.clear_color_addr,
+ clear_color_size);
+#else
+ unreachable("MCS partial resolve is not a thing on SNB and earlier");
+#endif
+ }
}
static void
vb[idx].BufferPitch = stride;
#if GEN_GEN >= 6
- vb[idx].VertexBufferMOCS = addr.mocs;
+ vb[idx].MOCS = addr.mocs;
#endif
#if GEN_GEN >= 7
const struct blorp_params *params)
{
struct GENX(VERTEX_BUFFER_STATE) vb[3];
+ uint32_t num_vbs = 2;
memset(vb, 0, sizeof(vb));
- struct blorp_address addr;
- uint32_t size;
- blorp_emit_vertex_data(batch, params, &addr, &size);
- blorp_fill_vertex_buffer_state(batch, vb, 0, addr, size, 3 * sizeof(float));
+ struct blorp_address addrs[2] = {};
+ uint32_t sizes[2];
+ blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
+ blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], sizes[0],
+ 3 * sizeof(float));
- blorp_emit_input_varying_data(batch, params, &addr, &size);
- blorp_fill_vertex_buffer_state(batch, vb, 1, addr, size, 0);
+ blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
+ blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], sizes[1], 0);
- uint32_t num_vbs = 2;
- if (params->dst_clear_color_as_input) {
- const unsigned clear_color_size =
- GEN_GEN < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
- blorp_fill_vertex_buffer_state(batch, vb, num_vbs++,
- params->dst.clear_color_addr,
- clear_color_size, 0);
- }
+ blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);
const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
};
slot++;
- if (params->dst_clear_color_as_input) {
- /* If the caller wants the destination indirect clear color, redirect
- * to vertex buffer 2 where we stored it earlier. The only users of
- * an indirect clear color source have that as their only vertex
- * attribute.
- */
- assert(num_varyings == 1);
+ for (unsigned i = 0; i < num_varyings; ++i) {
ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
- .VertexBufferIndex = 2,
+ .VertexBufferIndex = 1,
.Valid = true,
- .SourceElementOffset = 0,
- .Component0Control = VFCOMP_STORE_SRC,
-#if GEN_GEN >= 9
.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+ .SourceElementOffset = 16 + i * 4 * sizeof(float),
+ .Component0Control = VFCOMP_STORE_SRC,
.Component1Control = VFCOMP_STORE_SRC,
.Component2Control = VFCOMP_STORE_SRC,
.Component3Control = VFCOMP_STORE_SRC,
-#else
- /* Clear colors on gen7-8 are for bits out of one dword */
- .SourceElementFormat = ISL_FORMAT_R32_FLOAT,
- .Component1Control = VFCOMP_STORE_0,
- .Component2Control = VFCOMP_STORE_0,
- .Component3Control = VFCOMP_STORE_0,
+#if GEN_GEN <= 5
+ .DestinationElementOffset = slot * 4,
#endif
};
slot++;
- } else {
- for (unsigned i = 0; i < num_varyings; ++i) {
- ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
- .VertexBufferIndex = 1,
- .Valid = true,
- .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
- .SourceElementOffset = 16 + i * 4 * sizeof(float),
- .Component0Control = VFCOMP_STORE_SRC,
- .Component1Control = VFCOMP_STORE_SRC,
- .Component2Control = VFCOMP_STORE_SRC,
- .Component3Control = VFCOMP_STORE_SRC,
-#if GEN_GEN <= 5
- .DestinationElementOffset = slot * 4,
-#endif
- };
- slot++;
- }
}
const unsigned num_dwords =
dw += GENX(VERTEX_ELEMENT_STATE_length);
}
+ blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
+ vf.StatisticsEnable = false;
+ }
+
#if GEN_GEN >= 8
/* Overwrite Render Target Array Index (2nd dword) in the VUE header with
* primitive instance identifier. This is used for layered clears.
#if GEN_GEN >= 8
- blorp_emit(batch, GENX(3DSTATE_SF), sf);
+ blorp_emit(batch, GENX(3DSTATE_SF), sf) {
+#if GEN_GEN >= 12
+ sf.DerefBlockSize = PerPolyDerefMode;
+#endif
+ }
blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
raster.CullMode = CULLMODE_NONE;
ps.BindingTableEntryCount = 1;
}
- if (prog_data) {
- ps.DispatchGRFStartRegisterForConstantSetupData0 =
- prog_data->base.dispatch_grf_start_reg;
- ps.DispatchGRFStartRegisterForConstantSetupData2 =
- prog_data->dispatch_grf_start_reg_2;
+ /* SAMPLER_STATE prefetching is broken on Gen11 - WA_1606682166 */
+ if (GEN_GEN == 11)
+ ps.SamplerCount = 0;
+ if (prog_data) {
ps._8PixelDispatchEnable = prog_data->dispatch_8;
ps._16PixelDispatchEnable = prog_data->dispatch_16;
+ ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+ /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+ *
+ * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+ * Dispatch must not be enabled for PER_PIXEL dispatch mode."
+ *
+ * Since 16x MSAA is first introduced on SKL, we don't need to apply
+ * the workaround on any older hardware.
+ */
+ if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+ params->num_samples == 16) {
+ assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+ ps._32PixelDispatchEnable = false;
+ }
- ps.KernelStartPointer0 = params->wm_prog_kernel;
- ps.KernelStartPointer2 =
- params->wm_prog_kernel + prog_data->prog_offset_2;
+ ps.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+ ps.DispatchGRFStartRegisterForConstantSetupData2 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+ ps.KernelStartPointer0 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+ ps.KernelStartPointer1 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+ ps.KernelStartPointer2 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 2);
}
/* 3DSTATE_PS expects the number of threads per PSD, which is always 64
switch (params->fast_clear_op) {
case ISL_AUX_OP_NONE:
break;
+#if GEN_GEN >= 10
+ case ISL_AUX_OP_AMBIGUATE:
+ ps.RenderTargetFastClearEnable = true;
+ ps.RenderTargetResolveType = FAST_CLEAR_0;
+ break;
+#endif
#if GEN_GEN >= 9
case ISL_AUX_OP_PARTIAL_RESOLVE:
ps.RenderTargetResolveType = RESOLVE_PARTIAL;
#endif
if (prog_data) {
+ ps._8PixelDispatchEnable = prog_data->dispatch_8;
+ ps._16PixelDispatchEnable = prog_data->dispatch_16;
+ ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
ps.DispatchGRFStartRegisterForConstantSetupData0 =
- prog_data->base.dispatch_grf_start_reg;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
ps.DispatchGRFStartRegisterForConstantSetupData2 =
- prog_data->dispatch_grf_start_reg_2;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
- ps.KernelStartPointer0 = params->wm_prog_kernel;
- ps.KernelStartPointer2 =
- params->wm_prog_kernel + prog_data->prog_offset_2;
-
- ps._8PixelDispatchEnable = prog_data->dispatch_8;
- ps._16PixelDispatchEnable = prog_data->dispatch_16;
+ ps.KernelStartPointer0 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+ ps.KernelStartPointer1 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+ ps.KernelStartPointer2 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 2);
ps.AttributeEnable = prog_data->num_varying_inputs > 0;
} else {
if (prog_data) {
wm.ThreadDispatchEnable = true;
+ wm._8PixelDispatchEnable = prog_data->dispatch_8;
+ wm._16PixelDispatchEnable = prog_data->dispatch_16;
+ wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
wm.DispatchGRFStartRegisterForConstantSetupData0 =
- prog_data->base.dispatch_grf_start_reg;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
+ wm.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
wm.DispatchGRFStartRegisterForConstantSetupData2 =
- prog_data->dispatch_grf_start_reg_2;
-
- wm.KernelStartPointer0 = params->wm_prog_kernel;
- wm.KernelStartPointer2 =
- params->wm_prog_kernel + prog_data->prog_offset_2;
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
- wm._8PixelDispatchEnable = prog_data->dispatch_8;
- wm._16PixelDispatchEnable = prog_data->dispatch_16;
+ wm.KernelStartPointer0 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+ wm.KernelStartPointer1 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+ wm.KernelStartPointer2 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 2);
wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
}
static uint32_t
blorp_emit_color_calc_state(struct blorp_batch *batch,
- MAYBE_UNUSED const struct blorp_params *params)
+ UNUSED const struct blorp_params *params)
{
uint32_t offset;
blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
ds.DepthBufferWriteEnable = true;
switch (params->hiz_op) {
- case ISL_AUX_OP_NONE:
- ds.DepthTestEnable = true;
- ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS;
- break;
-
/* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
* - 7.5.3.1 Depth Buffer Clear
* - 7.5.3.2 Depth Buffer Resolve
ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
break;
+ case ISL_AUX_OP_NONE:
case ISL_AUX_OP_FAST_CLEAR:
case ISL_AUX_OP_AMBIGUATE:
ds.DepthTestEnable = false;
(void)depth_stencil_state_offset;
#endif
+#if GEN_GEN >= 12
+ blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
+ /* Update empty push constants for all stages (bitmask = 11111b) */
+ pc.ShaderUpdateEnable = 0x1f;
+ }
+#else
blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
#if GEN_GEN >= 7
blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
#endif
blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
+#endif
if (params->src.enabled)
blorp_emit_sampler_state(batch);
#endif /* GEN_GEN >= 6 */
-#if GEN_GEN >= 7 && GEN_GEN < 10
+#if GEN_GEN >= 7
static void
blorp_emit_memcpy(struct blorp_batch *batch,
struct blorp_address dst,
static void
blorp_emit_surface_state(struct blorp_batch *batch,
const struct brw_blorp_surface_info *surface,
- enum isl_aux_op op,
+ enum isl_aux_op aux_op,
void *state, uint32_t state_offset,
const bool color_write_disables[4],
bool is_render_target)
surf.dim = ISL_SURF_DIM_2D;
}
- /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */
+ if (isl_aux_usage_has_hiz(surface->aux_usage)) {
+ /* BLORP doesn't render with depth so we can't use HiZ */
+ assert(!is_render_target);
+ /* We can't reinterpret HiZ */
+ assert(surface->surf.format == surface->view.format);
+ }
enum isl_aux_usage aux_usage = surface->aux_usage;
- if (aux_usage == ISL_AUX_USAGE_HIZ)
- aux_usage = ISL_AUX_USAGE_NONE;
isl_channel_mask_t write_disable_mask = 0;
if (is_render_target && GEN_GEN <= 5) {
isl_surf_fill_state(batch->blorp->isl_dev, state,
.surf = &surf, .view = &surface->view,
.aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
+ .address =
+ blorp_get_surface_address(batch, surface->addr),
+ .aux_address = aux_usage == ISL_AUX_USAGE_NONE ? 0 :
+ blorp_get_surface_address(batch, surface->aux_addr),
+ .clear_address = !use_clear_address ? 0 :
+ blorp_get_surface_address(batch,
+ surface->clear_color_addr),
.mocs = surface->addr.mocs,
.clear_color = surface->clear_color,
.use_clear_address = use_clear_address,
surface->aux_addr, *aux_addr);
}
- if (surface->clear_color_addr.buffer) {
+ if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
#if GEN_GEN >= 10
assert((surface->clear_color_addr.offset & 0x3f) == 0);
uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
isl_dev->ss.clear_color_state_offset,
surface->clear_color_addr, *clear_addr);
#elif GEN_GEN >= 7
- if (op == ISL_AUX_OP_FULL_RESOLVE || op == ISL_AUX_OP_PARTIAL_RESOLVE) {
+ /* Fast clears just whack the AUX surface and don't actually use the
+ * clear color for anything. We can avoid the MI memcpy in that case.
+ */
+ if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
uint32_t bind_offset = 0, surface_offsets[2];
void *surface_maps[2];
- MAYBE_UNUSED bool has_indirect_clear_color = false;
+ UNUSED bool has_indirect_clear_color = false;
if (params->use_pre_baked_binding_table) {
bind_offset = params->pre_baked_binding_table_offset;
} else {
*/
blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
pipe.StateCacheInvalidationEnable = true;
+#if GEN_GEN >= 12
+ pipe.TileCacheFlushEnable = true;
+#endif
}
}
#endif
params->depth.addr, 0);
info.hiz_usage = params->depth.aux_usage;
- if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
+ if (isl_aux_usage_has_hiz(info.hiz_usage)) {
info.hiz_surf = ¶ms->depth.aux_surf;
struct blorp_address hiz_address = params->depth.aux_addr;
if (params->stencil.enabled) {
info.stencil_surf = ¶ms->stencil.surf;
+ info.stencil_aux_usage = params->stencil.aux_usage;
struct blorp_address stencil_address = params->stencil.addr;
#if GEN_GEN == 6
/* Sandy bridge hardware does not technically support mipmapped stencil.
}
isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
+
+#if GEN_GEN >= 12
+ /* GEN:BUG:1408224581
+ *
+ * Workaround (Gen12LP A-step only): an additional pipe control with
+ * post-sync = store dword operation is required. (The w/a is to have
+ * an additional pipe control after the stencil state whenever the
+ * surface state bits of this state are changing.)
+ */
+ blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
+ pc.PostSyncOperation = WriteImmediateData;
+ pc.Address = blorp_get_workaround_page(batch);
+ }
+#endif
}
#if GEN_GEN >= 8
*/
assert(params->depth.enabled || params->stencil.enabled);
- /* The stencil buffer should only be enabled if a fast clear operation is
- * requested.
+ /* On GEN >= 12, the stencil buffer should only be enabled if a fast clear
+ * or full resolve operation is requested. On all earlier gens, it should
+ * only be enabled if a fast clear operation is requested.
*/
- if (params->stencil.enabled)
+ if (params->stencil.enabled) {
+#if GEN_GEN >= 12
+ assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR ||
+ params->hiz_op == ISL_AUX_OP_FULL_RESOLVE);
+#else
assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);
+#endif
+ }
/* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
*
*/
blorp_emit_3dstate_multisample(batch, params);
+ /* From the BDW PRM Volume 7, Depth Buffer Clear:
+ *
+ * The clear value must be between the min and max depth values
+ * (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
+ * D32_FLOAT, then +/-DENORM values are also allowed.
+ *
+ * Set the bounds to match our hardware limits, [0.0, 1.0].
+ */
+ if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
+ assert(params->depth.clear_color.f32[0] >= 0.0f);
+ assert(params->depth.clear_color.f32[0] <= 1.0f);
+ blorp_emit_cc_viewport(batch);
+ }
+
+ /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
+ * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
+ * even when WM_HZ_OP is active. However, WM thread dispatch is normally
+ * disabled for HiZ ops and it appears that force-enabling it can lead to
+ * GPU hangs on at least Skylake. Since we don't know the current state of
+ * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
+ */
+ blorp_emit(batch, GENX(3DSTATE_WM), wm);
+
/* If we can't alter the depth stencil config and multiple layers are
* involved, the HiZ op will fail. This is because the op requires that a
* new config is emitted for each additional layer.
break;
case ISL_AUX_OP_FULL_RESOLVE:
assert(params->full_surface_hiz_op);
- hzp.DepthBufferResolveEnable = true;
+ hzp.DepthBufferResolveEnable = params->depth.enabled;
+#if GEN_GEN >= 12
+ if (params->stencil.enabled) {
+ assert(params->stencil.aux_usage == ISL_AUX_USAGE_CCS_E);
+ hzp.StencilBufferResolveEnable = true;
+ }
+#endif
break;
case ISL_AUX_OP_AMBIGUATE:
assert(params->full_surface_hiz_op);
enum isl_aux_op op)
{
if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) {
-#if GEN_GEN >= 9
+#if GEN_GEN == 11
+ blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
+ pipe.CommandStreamerStallEnable = true;
+ }
+
+ /* 2 QWORDS */
+ const unsigned inlinedata_dw = 2 * 2;
+ const unsigned num_dwords = GENX(MI_ATOMIC_length) + inlinedata_dw;
+
+ struct blorp_address clear_addr = info->clear_color_addr;
+ uint32_t *dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
+ .DataSize = MI_ATOMIC_QWORD,
+ .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
+ .InlineData = true,
+ .MemoryAddress = clear_addr);
+ /* dw starts at dword 1, but we need to fill dwords 3 and 5 */
+ dw[2] = info->clear_color.u32[0];
+ dw[3] = 0;
+ dw[4] = info->clear_color.u32[1];
+ dw[5] = 0;
+
+ clear_addr.offset += 8;
+ dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
+ .DataSize = MI_ATOMIC_QWORD,
+ .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
+ .CSSTALL = true,
+ .ReturnDataControl = true,
+ .InlineData = true,
+ .MemoryAddress = clear_addr);
+ /* dw starts at dword 1, but we need to fill dwords 3 and 5 */
+ dw[2] = info->clear_color.u32[2];
+ dw[3] = 0;
+ dw[4] = info->clear_color.u32[3];
+ dw[5] = 0;
+
+ blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
+ pipe.StateCacheInvalidationEnable = true;
+ pipe.TextureCacheInvalidationEnable = true;
+ }
+#elif GEN_GEN >= 9
+
+ /* According to GEN:BUG:2201730850, in the Clear Color Programming Note
+ * under the Red channel, "Software shall write the converted Depth
+ * Clear to this dword." The only depth formats listed under the red
+ * channel are IEEE_FP and UNORM24_X8. These two requirements are
+ * incompatible with the UNORM16 depth format, so just ignore that case
+ * and simply perform the conversion for all depth formats.
+ */
+ union isl_color_value fixed_color = info->clear_color;
+ if (GEN_GEN == 12 && isl_surf_usage_is_depth(info->surf.usage)) {
+ isl_color_value_pack(&info->clear_color, info->surf.format,
+ fixed_color.u32);
+ }
+
for (int i = 0; i < 4; i++) {
blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
sdi.Address = info->clear_color_addr;
sdi.Address.offset += i * 4;
- sdi.ImmediateData = info->clear_color.u32[i];
+ sdi.ImmediateData = fixed_color.u32[i];
+#if GEN_GEN >= 12
+ if (i == 3)
+ sdi.ForceWriteCompletionCheck = true;
+#endif
+ }
+ }
+
+/* The RENDER_SURFACE_STATE::ClearColor field states that software should
+ * write the converted depth value 16B after the clear address:
+ *
+ * 3D Sampler will always fetch clear depth from the location 16-bytes
+ * above this address, where the clear depth, converted to native
+ * surface format by software, will be stored.
+ *
+ */
+#if GEN_GEN >= 12
+ if (isl_surf_usage_is_depth(info->surf.usage)) {
+ blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+ sdi.Address = info->clear_color_addr;
+ sdi.Address.offset += 4 * 4;
+ sdi.ImmediateData = fixed_color.u32[0];
+ sdi.ForceWriteCompletionCheck = true;
}
}
+#endif
+
#elif GEN_GEN >= 7
blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
sdi.Address = info->clear_color_addr;
static void
blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
{
- blorp_update_clear_color(batch, ¶ms->dst, params->fast_clear_op);
- blorp_update_clear_color(batch, ¶ms->depth, params->hiz_op);
+ if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
+ blorp_update_clear_color(batch, ¶ms->dst, params->fast_clear_op);
+ blorp_update_clear_color(batch, ¶ms->depth, params->hiz_op);
+ }
#if GEN_GEN >= 8
if (params->hiz_op != ISL_AUX_OP_NONE) {