X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fblorp%2Fblorp_genX_exec.h;h=d271bb4248c705f9548aff01d84c6e95299207a8;hb=09e4c33085f15ffa691053143bec9dbf4aecfeaa;hp=7a8c45dbee57a196029049401139020358d113e8;hpb=5bcf479524b96554cab7d2429dacf650b4054638;p=mesa.git diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h index 7a8c45dbee5..d271bb4248c 100644 --- a/src/intel/blorp/blorp_genX_exec.h +++ b/src/intel/blorp/blorp_genX_exec.h @@ -27,6 +27,7 @@ #include "blorp_priv.h" #include "dev/gen_device_info.h" #include "common/gen_sample_positions.h" +#include "common/gen_l3_config.h" #include "genxml/gen_macros.h" /** @@ -62,12 +63,11 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size, static void blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, const struct blorp_address *addrs, + uint32_t *sizes, unsigned num_vbs); -#if GEN_GEN >= 8 -static struct blorp_address +UNUSED static struct blorp_address blorp_get_workaround_page(struct blorp_batch *batch); -#endif static void blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries, @@ -82,14 +82,23 @@ static void blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, struct blorp_address address, uint32_t delta); +static uint64_t +blorp_get_surface_address(struct blorp_batch *batch, + struct blorp_address address); + #if GEN_GEN >= 7 && GEN_GEN < 10 static struct blorp_address blorp_get_surface_base_address(struct blorp_batch *batch); #endif +#if GEN_GEN >= 7 +static const struct gen_l3_config * +blorp_get_l3_config(struct blorp_batch *batch); +# else static void blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size, unsigned sf_entry_size); +#endif static void blorp_emit_pipeline(struct blorp_batch *batch, @@ -126,12 +135,13 @@ _blorp_combine_address(struct blorp_batch *batch, void *location, _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name), \ _dst = NULL) -#define blorp_emitn(batch, cmd, n) ({ \ +#define blorp_emitn(batch, cmd, n, ...) ({ \ uint32_t *_dw = blorp_emit_dwords(batch, n); \ if (_dw) { \ struct cmd template = { \ _blorp_cmd_header(cmd), \ .DWordLength = n - _blorp_cmd_length_bias(cmd), \ + __VA_ARGS__ \ }; \ _blorp_cmd_pack(cmd)(batch, _dw, &template); \ } \ @@ -201,7 +211,42 @@ emit_urb_config(struct blorp_batch *batch, const unsigned sf_entry_size = params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0; +#if GEN_GEN >= 7 + assert(sf_entry_size == 0); + const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 }; + + unsigned entries[4], start[4]; + gen_get_urb_config(batch->blorp->compiler->devinfo, + blorp_get_l3_config(batch), + false, false, entry_size, entries, start); + +#if GEN_GEN == 7 && !GEN_IS_HASWELL + /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: + * + * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall + * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS, + * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS, + * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL + * needs to be sent before any combination of VS associated 3DSTATE." + */ + blorp_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.DepthStallEnable = true; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = blorp_get_workaround_page(batch); + } +#endif + + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = start[i]; + urb.VSURBEntryAllocationSize = entry_size[i] - 1; + urb.VSNumberofURBEntries = entries[i]; + } + } +#else /* GEN_GEN < 7 */ blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size); +#endif } #if GEN_GEN >= 7 @@ -311,7 +356,7 @@ blorp_fill_vertex_buffer_state(struct blorp_batch *batch, vb[idx].BufferPitch = stride; #if GEN_GEN >= 6 - vb[idx].VertexBufferMOCS = addr.mocs; + vb[idx].MOCS = addr.mocs; #endif #if GEN_GEN >= 7 @@ -339,21 +384,21 @@ blorp_emit_vertex_buffers(struct blorp_batch *batch, memset(vb, 0, sizeof(vb)); struct blorp_address addrs[2] = {}; - uint32_t size; - blorp_emit_vertex_data(batch, params, &addrs[0], &size); - blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], size, + uint32_t sizes[2]; + blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]); + blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], sizes[0], 3 * sizeof(float)); - blorp_emit_input_varying_data(batch, params, &addrs[1], &size); - blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], size, 0); + blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]); + blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], sizes[1], 0); + + blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs); const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length); uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords); if (!dw) return; - blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, num_vbs); - for (unsigned i = 0; i < num_vbs; i++) { GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]); dw += GENX(VERTEX_BUFFER_STATE_length); @@ -509,6 +554,10 @@ blorp_emit_vertex_elements(struct blorp_batch *batch, dw += GENX(VERTEX_ELEMENT_STATE_length); } + blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) { + vf.StatisticsEnable = false; + } + #if GEN_GEN >= 8 /* Overwrite Render Target Array Index (2nd dword) in the VUE header with * primitive instance identifier. This is used for layered clears. @@ -661,7 +710,11 @@ blorp_emit_sf_config(struct blorp_batch *batch, #if GEN_GEN >= 8 - blorp_emit(batch, GENX(3DSTATE_SF), sf); + blorp_emit(batch, GENX(3DSTATE_SF), sf) { +#if GEN_GEN >= 12 + sf.DerefBlockSize = PerPolyDerefMode; +#endif + } blorp_emit(batch, GENX(3DSTATE_RASTER), raster) { raster.CullMode = CULLMODE_NONE; @@ -762,12 +815,9 @@ blorp_emit_ps_config(struct blorp_batch *batch, ps.BindingTableEntryCount = 1; } - /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to - * disable prefetching of binding tables on A0 and B0 steppings. - * TODO: Revisit this WA on C0 stepping. - */ + /* SAMPLER_STATE prefetching is broken on Gen11 - WA_1606682166 */ if (GEN_GEN == 11) - ps.BindingTableEntryCount = 0; + ps.SamplerCount = 0; if (prog_data) { ps._8PixelDispatchEnable = prog_data->dispatch_8; @@ -818,6 +868,12 @@ blorp_emit_ps_config(struct blorp_batch *batch, switch (params->fast_clear_op) { case ISL_AUX_OP_NONE: break; +#if GEN_GEN >= 10 + case ISL_AUX_OP_AMBIGUATE: + ps.RenderTargetFastClearEnable = true; + ps.RenderTargetResolveType = FAST_CLEAR_0; + break; +#endif #if GEN_GEN >= 9 case ISL_AUX_OP_PARTIAL_RESOLVE: ps.RenderTargetResolveType = RESOLVE_PARTIAL; @@ -1055,7 +1111,7 @@ blorp_emit_blend_state(struct blorp_batch *batch, static uint32_t blorp_emit_color_calc_state(struct blorp_batch *batch, - MAYBE_UNUSED const struct blorp_params *params) + UNUSED const struct blorp_params *params) { uint32_t offset; blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) { @@ -1092,11 +1148,6 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch, ds.DepthBufferWriteEnable = true; switch (params->hiz_op) { - case ISL_AUX_OP_NONE: - ds.DepthTestEnable = true; - ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS; - break; - /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1: * - 7.5.3.1 Depth Buffer Clear * - 7.5.3.2 Depth Buffer Resolve @@ -1107,6 +1158,7 @@ blorp_emit_depth_stencil_state(struct blorp_batch *batch, ds.DepthTestFunction = COMPAREFUNCTION_NEVER; break; + case ISL_AUX_OP_NONE: case ISL_AUX_OP_FAST_CLEAR: case ISL_AUX_OP_AMBIGUATE: ds.DepthTestEnable = false; @@ -1237,6 +1289,12 @@ blorp_emit_pipeline(struct blorp_batch *batch, (void)depth_stencil_state_offset; #endif +#if GEN_GEN >= 12 + blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) { + /* Update empty push constants for all stages (bitmask = 11111b) */ + pc.ShaderUpdateEnable = 0x1f; + } +#else blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs); #if GEN_GEN >= 7 blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs); @@ -1244,6 +1302,7 @@ blorp_emit_pipeline(struct blorp_batch *batch, #endif blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs); blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps); +#endif if (params->src.enabled) blorp_emit_sampler_state(batch); @@ -1326,7 +1385,7 @@ blorp_emit_memcpy(struct blorp_batch *batch, static void blorp_emit_surface_state(struct blorp_batch *batch, const struct brw_blorp_surface_info *surface, - enum isl_aux_op op, + enum isl_aux_op aux_op, void *state, uint32_t state_offset, const bool color_write_disables[4], bool is_render_target) @@ -1340,10 +1399,13 @@ blorp_emit_surface_state(struct blorp_batch *batch, surf.dim = ISL_SURF_DIM_2D; } - /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */ + if (isl_aux_usage_has_hiz(surface->aux_usage)) { + /* BLORP doesn't render with depth so we can't use HiZ */ + assert(!is_render_target); + /* We can't reinterpret HiZ */ + assert(surface->surf.format == surface->view.format); + } enum isl_aux_usage aux_usage = surface->aux_usage; - if (aux_usage == ISL_AUX_USAGE_HIZ) - aux_usage = ISL_AUX_USAGE_NONE; isl_channel_mask_t write_disable_mask = 0; if (is_render_target && GEN_GEN <= 5) { @@ -1363,6 +1425,13 @@ blorp_emit_surface_state(struct blorp_batch *batch, isl_surf_fill_state(batch->blorp->isl_dev, state, .surf = &surf, .view = &surface->view, .aux_surf = &surface->aux_surf, .aux_usage = aux_usage, + .address = + blorp_get_surface_address(batch, surface->addr), + .aux_address = aux_usage == ISL_AUX_USAGE_NONE ? 0 : + blorp_get_surface_address(batch, surface->aux_addr), + .clear_address = !use_clear_address ? 0 : + blorp_get_surface_address(batch, + surface->clear_color_addr), .mocs = surface->addr.mocs, .clear_color = surface->clear_color, .use_clear_address = use_clear_address, @@ -1382,7 +1451,7 @@ blorp_emit_surface_state(struct blorp_batch *batch, surface->aux_addr, *aux_addr); } - if (surface->clear_color_addr.buffer) { + if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) { #if GEN_GEN >= 10 assert((surface->clear_color_addr.offset & 0x3f) == 0); uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset; @@ -1390,7 +1459,10 @@ blorp_emit_surface_state(struct blorp_batch *batch, isl_dev->ss.clear_color_state_offset, surface->clear_color_addr, *clear_addr); #elif GEN_GEN >= 7 - if (op == ISL_AUX_OP_FULL_RESOLVE || op == ISL_AUX_OP_PARTIAL_RESOLVE) { + /* Fast clears just whack the AUX surface and don't actually use the + * clear color for anything. We can avoid the MI memcpy on that case. + */ + if (aux_op != ISL_AUX_OP_FAST_CLEAR) { struct blorp_address dst_addr = blorp_get_surface_base_address(batch); dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset; blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr, @@ -1446,7 +1518,7 @@ blorp_emit_surface_states(struct blorp_batch *batch, uint32_t bind_offset = 0, surface_offsets[2]; void *surface_maps[2]; - MAYBE_UNUSED bool has_indirect_clear_color = false; + UNUSED bool has_indirect_clear_color = false; if (params->use_pre_baked_binding_table) { bind_offset = params->pre_baked_binding_table_offset; } else { @@ -1495,6 +1567,9 @@ blorp_emit_surface_states(struct blorp_batch *batch, */ blorp_emit(batch, GENX(PIPE_CONTROL), pipe) { pipe.StateCacheInvalidationEnable = true; +#if GEN_GEN >= 12 + pipe.TileCacheFlushEnable = true; +#endif } } #endif @@ -1548,7 +1623,7 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch, params->depth.addr, 0); info.hiz_usage = params->depth.aux_usage; - if (info.hiz_usage == ISL_AUX_USAGE_HIZ) { + if (isl_aux_usage_has_hiz(info.hiz_usage)) { info.hiz_surf = ¶ms->depth.aux_surf; struct blorp_address hiz_address = params->depth.aux_addr; @@ -1576,6 +1651,7 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch, if (params->stencil.enabled) { info.stencil_surf = ¶ms->stencil.surf; + info.stencil_aux_usage = params->stencil.aux_usage; struct blorp_address stencil_address = params->stencil.addr; #if GEN_GEN == 6 /* Sandy bridge hardware does not technically support mipmapped stencil. @@ -1596,6 +1672,20 @@ blorp_emit_depth_stencil_config(struct blorp_batch *batch, } isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info); + +#if GEN_GEN >= 12 + /* GEN:BUG:1408224581 + * + * Workaround: Gen12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). + */ + blorp_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteImmediateData; + pc.Address = blorp_get_workaround_page(batch); + } +#endif } #if GEN_GEN >= 8 @@ -1611,11 +1701,18 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch, */ assert(params->depth.enabled || params->stencil.enabled); - /* The stencil buffer should only be enabled if a fast clear operation is - * requested. + /* The stencil buffer should only be enabled on GEN == 12, if a fast clear + * or full resolve operation is requested. On rest of the GEN, if a fast + * clear operation is requested. */ - if (params->stencil.enabled) + if (params->stencil.enabled) { +#if GEN_GEN >= 12 + assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR || + params->hiz_op == ISL_AUX_OP_FULL_RESOLVE); +#else assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR); +#endif + } /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP: * @@ -1642,6 +1739,15 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch, blorp_emit_cc_viewport(batch); } + /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the + * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch + * even when WM_HZ_OP is active. However, WM thread dispatch is normally + * disabled for HiZ ops and it appears that force-enabling it can lead to + * GPU hangs on at least Skylake. Since we don't know the current state of + * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP. + */ + blorp_emit(batch, GENX(3DSTATE_WM), wm); + /* If we can't alter the depth stencil config and multiple layers are * involved, the HiZ op will fail. This is because the op requires that a * new config is emitted for each additional layer. @@ -1662,7 +1768,13 @@ blorp_emit_gen8_hiz_op(struct blorp_batch *batch, break; case ISL_AUX_OP_FULL_RESOLVE: assert(params->full_surface_hiz_op); - hzp.DepthBufferResolveEnable = true; + hzp.DepthBufferResolveEnable = params->depth.enabled; +#if GEN_GEN >= 12 + if (params->stencil.enabled) { + assert(params->stencil.aux_usage == ISL_AUX_USAGE_CCS_E); + hzp.StencilBufferResolveEnable = true; + } +#endif break; case ISL_AUX_OP_AMBIGUATE: assert(params->full_surface_hiz_op); @@ -1706,14 +1818,91 @@ blorp_update_clear_color(struct blorp_batch *batch, enum isl_aux_op op) { if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) { -#if GEN_GEN >= 9 +#if GEN_GEN == 11 + blorp_emit(batch, GENX(PIPE_CONTROL), pipe) { + pipe.CommandStreamerStallEnable = true; + } + + /* 2 QWORDS */ + const unsigned inlinedata_dw = 2 * 2; + const unsigned num_dwords = GENX(MI_ATOMIC_length) + inlinedata_dw; + + struct blorp_address clear_addr = info->clear_color_addr; + uint32_t *dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords, + .DataSize = MI_ATOMIC_QWORD, + .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B, + .InlineData = true, + .MemoryAddress = clear_addr); + /* dw starts at dword 1, but we need to fill dwords 3 and 5 */ + dw[2] = info->clear_color.u32[0]; + dw[3] = 0; + dw[4] = info->clear_color.u32[1]; + dw[5] = 0; + + clear_addr.offset += 8; + dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords, + .DataSize = MI_ATOMIC_QWORD, + .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B, + .CSSTALL = true, + .ReturnDataControl = true, + .InlineData = true, + .MemoryAddress = clear_addr); + /* dw starts at dword 1, but we need to fill dwords 3 and 5 */ + dw[2] = info->clear_color.u32[2]; + dw[3] = 0; + dw[4] = info->clear_color.u32[3]; + dw[5] = 0; + + blorp_emit(batch, GENX(PIPE_CONTROL), pipe) { + pipe.StateCacheInvalidationEnable = true; + pipe.TextureCacheInvalidationEnable = true; + } +#elif GEN_GEN >= 9 + + /* According to GEN:BUG:2201730850, in the Clear Color Programming Note + * under the Red channel, "Software shall write the converted Depth + * Clear to this dword." The only depth formats listed under the red + * channel are IEEE_FP and UNORM24_X8. These two requirements are + * incompatible with the UNORM16 depth format, so just ignore that case + * and simply perform the conversion for all depth formats. + */ + union isl_color_value fixed_color = info->clear_color; + if (GEN_GEN == 12 && isl_surf_usage_is_depth(info->surf.usage)) { + isl_color_value_pack(&info->clear_color, info->surf.format, + fixed_color.u32); + } + for (int i = 0; i < 4; i++) { blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = info->clear_color_addr; sdi.Address.offset += i * 4; - sdi.ImmediateData = info->clear_color.u32[i]; + sdi.ImmediateData = fixed_color.u32[i]; +#if GEN_GEN >= 12 + if (i == 3) + sdi.ForceWriteCompletionCheck = true; +#endif } } + +/* The RENDER_SURFACE_STATE::ClearColor field states that software should + * write the converted depth value 16B after the clear address: + * + * 3D Sampler will always fetch clear depth from the location 16-bytes + * above this address, where the clear depth, converted to native + * surface format by software, will be stored. + * + */ +#if GEN_GEN >= 12 + if (isl_surf_usage_is_depth(info->surf.usage)) { + blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = info->clear_color_addr; + sdi.Address.offset += 4 * 4; + sdi.ImmediateData = fixed_color.u32[0]; + sdi.ForceWriteCompletionCheck = true; + } + } +#endif + #elif GEN_GEN >= 7 blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) { sdi.Address = info->clear_color_addr;