X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Firis%2Firis_state.c;h=7d9ac0749880dc0ec366a1712ca346f150b2a7c6;hb=3fed1c75ef4d165a3c96f3a9ac0295268c16c6be;hp=da6c89260014dd7b0ebfb01629c87f5258637940;hpb=882ca6dfb0f3d17e0f8bc917307d915ab1718069;p=mesa.git diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index da6c8926001..7d9ac074988 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -95,6 +95,7 @@ #include "util/u_transfer.h" #include "util/u_upload_mgr.h" #include "util/u_viewport.h" +#include "util/u_memory.h" #include "drm-uapi/i915_drm.h" #include "nir.h" #include "intel/compiler/brw_compiler.h" @@ -110,12 +111,6 @@ #include "iris_genx_macros.h" #include "intel/common/gen_guardband.h" -static uint32_t -mocs(const struct iris_bo *bo, const struct isl_device *dev) -{ - return bo && bo->external ? dev->mocs.external : dev->mocs.internal; -} - /** * Statically assert that PIPE_* enums match the hardware packets. * (As long as they match, we don't need to translate them.) @@ -344,11 +339,12 @@ stream_state(struct iris_batch *batch, u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr); struct iris_bo *bo = iris_resource_bo(*out_res); - iris_use_pinned_bo(batch, bo, false); + iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE); - *out_offset += iris_bo_offset_from_base_address(bo); + iris_record_state_size(batch->state_sizes, + bo->gtt_offset + *out_offset, size); - iris_record_state_size(batch->state_sizes, *out_offset, size); + *out_offset += iris_bo_offset_from_base_address(bo); return ptr; } @@ -386,6 +382,8 @@ emit_state(struct iris_batch *batch, static void flush_before_state_base_change(struct iris_batch *batch) { + const struct gen_device_info *devinfo = &batch->screen->devinfo; + /* Flush before emitting STATE_BASE_ADDRESS. * * This isn't documented anywhere in the PRM. However, it seems to be @@ -411,7 +409,18 @@ flush_before_state_base_change(struct iris_batch *batch) "change STATE_BASE_ADDRESS (flushes)", PIPE_CONTROL_RENDER_TARGET_FLUSH | PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_DATA_CACHE_FLUSH); + PIPE_CONTROL_DATA_CACHE_FLUSH | + /* GEN:BUG:1606662791: + * + * Software must program PIPE_CONTROL command + * with "HDC Pipeline Flush" prior to + * programming of the below two non-pipeline + * state : + * * STATE_BASE_ADDRESS + * * 3DSTATE_BINDING_TABLE_POOL_ALLOC + */ + ((GEN_GEN == 12 && devinfo->revision == 0 /* A0 */ ? 
+ PIPE_CONTROL_FLUSH_HDC : 0))); } static void @@ -517,10 +526,12 @@ static void iris_load_register_mem32(struct iris_batch *batch, uint32_t reg, struct iris_bo *bo, uint32_t offset) { + iris_batch_sync_region_start(batch); iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { lrm.RegisterAddress = reg; lrm.MemoryAddress = ro_bo(bo, offset); } + iris_batch_sync_region_end(batch); } /** @@ -540,11 +551,13 @@ iris_store_register_mem32(struct iris_batch *batch, uint32_t reg, struct iris_bo *bo, uint32_t offset, bool predicated) { + iris_batch_sync_region_start(batch); iris_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) { srm.RegisterAddress = reg; - srm.MemoryAddress = rw_bo(bo, offset); + srm.MemoryAddress = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE); srm.PredicateEnable = predicated; } + iris_batch_sync_region_end(batch); } static void @@ -561,10 +574,12 @@ iris_store_data_imm32(struct iris_batch *batch, struct iris_bo *bo, uint32_t offset, uint32_t imm) { + iris_batch_sync_region_start(batch); iris_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) { - sdi.Address = rw_bo(bo, offset); + sdi.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE); sdi.ImmediateData = imm; } + iris_batch_sync_region_end(batch); } static void @@ -576,11 +591,13 @@ iris_store_data_imm64(struct iris_batch *batch, * 2 in genxml but it's actually variable length and we need 5 DWords. */ void *map = iris_get_command_space(batch, 4 * 5); + iris_batch_sync_region_start(batch); _iris_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) { sdi.DWordLength = 5 - 2; - sdi.Address = rw_bo(bo, offset); + sdi.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE); sdi.ImmediateData = imm; } + iris_batch_sync_region_end(batch); } static void @@ -593,13 +610,17 @@ iris_copy_mem_mem(struct iris_batch *batch, assert(bytes % 4 == 0); assert(dst_offset % 4 == 0); assert(src_offset % 4 == 0); + iris_batch_sync_region_start(batch); for (unsigned i = 0; i < bytes; i += 4) { iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) { - cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i); + cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i, + IRIS_DOMAIN_OTHER_WRITE); cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i); } } + + iris_batch_sync_region_end(batch); } static void @@ -719,10 +740,11 @@ init_state_base_address(struct iris_batch *batch) } static void -iris_emit_l3_config(struct iris_batch *batch, const struct gen_l3_config *cfg, - bool has_slm, bool wants_dc_cache) +iris_emit_l3_config(struct iris_batch *batch, + const struct gen_l3_config *cfg) { uint32_t reg_val; + assert(cfg || GEN_GEN >= 12); #if GEN_GEN >= 12 #define L3_ALLOCATION_REG GENX(L3ALLOC) @@ -733,8 +755,8 @@ iris_emit_l3_config(struct iris_batch *batch, const struct gen_l3_config *cfg, #endif iris_pack_state(L3_ALLOCATION_REG, ®_val, reg) { -#if GEN_GEN < 12 - reg.SLMEnable = has_slm; +#if GEN_GEN < 11 + reg.SLMEnable = cfg->n[GEN_L3P_SLM] > 0; #endif #if GEN_GEN == 11 /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set @@ -744,28 +766,21 @@ iris_emit_l3_config(struct iris_batch *batch, const struct gen_l3_config *cfg, reg.ErrorDetectionBehaviorControl = true; reg.UseFullWays = true; #endif - reg.URBAllocation = cfg->n[GEN_L3P_URB]; - reg.ROAllocation = cfg->n[GEN_L3P_RO]; - reg.DCAllocation = cfg->n[GEN_L3P_DC]; - reg.AllAllocation = cfg->n[GEN_L3P_ALL]; + if (GEN_GEN < 12 || cfg) { + reg.URBAllocation = cfg->n[GEN_L3P_URB]; + reg.ROAllocation = cfg->n[GEN_L3P_RO]; + reg.DCAllocation = cfg->n[GEN_L3P_DC]; + reg.AllAllocation = 
cfg->n[GEN_L3P_ALL]; + } else { +#if GEN_GEN >= 12 + reg.L3FullWayAllocationEnable = true; +#endif + } } _iris_emit_lri(batch, L3_ALLOCATION_REG_num, reg_val); } -static void -iris_emit_default_l3_config(struct iris_batch *batch, - const struct gen_device_info *devinfo, - bool compute) -{ - bool wants_dc_cache = true; - bool has_slm = compute; - const struct gen_l3_weights w = - gen_get_default_l3_weights(devinfo, wants_dc_cache, has_slm); - const struct gen_l3_config *cfg = gen_get_l3_config(devinfo, w); - iris_emit_l3_config(batch, cfg, has_slm, wants_dc_cache); -} - -#if GEN_GEN == 9 || GEN_GEN == 10 +#if GEN_GEN == 9 static void iris_enable_obj_preemption(struct iris_batch *batch, bool enable) { @@ -885,6 +900,37 @@ iris_alloc_push_constants(struct iris_batch *batch) } } +#if GEN_GEN >= 12 +static void +init_aux_map_state(struct iris_batch *batch); +#endif + +/** + * Upload initial GPU state for any kind of context. + * + * These need to happen for both render and compute. + */ +static void +iris_init_common_context(struct iris_batch *batch) +{ +#if GEN_GEN == 11 + uint32_t reg_val; + + iris_pack_state(GENX(SAMPLER_MODE), ®_val, reg) { + reg.HeaderlessMessageforPreemptableContexts = 1; + reg.HeaderlessMessageforPreemptableContextsMask = 1; + } + iris_emit_lri(batch, SAMPLER_MODE, reg_val); + + /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */ + iris_pack_state(GENX(HALF_SLICE_CHICKEN7), ®_val, reg) { + reg.EnabledTexelOffsetPrecisionFix = 1; + reg.EnabledTexelOffsetPrecisionFixMask = 1; + } + iris_emit_lri(batch, HALF_SLICE_CHICKEN7, reg_val); +#endif +} + /** * Upload the initial GPU state for a render context. * @@ -897,12 +943,16 @@ iris_init_render_context(struct iris_batch *batch) UNUSED const struct gen_device_info *devinfo = &batch->screen->devinfo; uint32_t reg_val; + iris_batch_sync_region_start(batch); + emit_pipeline_select(batch, _3D); - iris_emit_default_l3_config(batch, devinfo, false); + iris_emit_l3_config(batch, batch->screen->l3_config_3d); init_state_base_address(batch); + iris_init_common_context(batch); + #if GEN_GEN >= 9 iris_pack_state(GENX(CS_DEBUG_MODE2), ®_val, reg) { reg.CONSTANT_BUFFERAddressOffsetDisable = true; @@ -931,31 +981,26 @@ iris_init_render_context(struct iris_batch *batch) #endif #if GEN_GEN == 11 - iris_pack_state(GENX(SAMPLER_MODE), ®_val, reg) { - reg.HeaderlessMessageforPreemptableContexts = 1; - reg.HeaderlessMessageforPreemptableContextsMask = 1; - } - iris_emit_lri(batch, SAMPLER_MODE, reg_val); - - /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */ - iris_pack_state(GENX(HALF_SLICE_CHICKEN7), ®_val, reg) { - reg.EnabledTexelOffsetPrecisionFix = 1; - reg.EnabledTexelOffsetPrecisionFixMask = 1; - } - iris_emit_lri(batch, HALF_SLICE_CHICKEN7, reg_val); + iris_pack_state(GENX(TCCNTLREG), ®_val, reg) { + reg.L3DataPartialWriteMergingEnable = true; + reg.ColorZPartialWriteMergingEnable = true; + reg.URBPartialWriteMergingEnable = true; + reg.TCDisable = true; + } + iris_emit_lri(batch, TCCNTLREG, reg_val); - /* Hardware specification recommends disabling repacking for the - * compatibility with decompression mechanism in display controller. - */ - if (devinfo->disable_ccs_repack) { - iris_pack_state(GENX(CACHE_MODE_0), ®_val, reg) { - reg.DisableRepackingforCompression = true; - reg.DisableRepackingforCompressionMask = true; - } - iris_emit_lri(batch, CACHE_MODE_0, reg_val); + /* Hardware specification recommends disabling repacking for the + * compatibility with decompression mechanism in display controller. 
+ */ + if (devinfo->disable_ccs_repack) { + iris_pack_state(GENX(CACHE_MODE_0), ®_val, reg) { + reg.DisableRepackingforCompression = true; + reg.DisableRepackingforCompressionMask = true; } + iris_emit_lri(batch, CACHE_MODE_0, reg_val); + } - iris_upload_slice_hashing_state(batch); + iris_upload_slice_hashing_state(batch); #endif /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid @@ -994,10 +1039,12 @@ iris_init_render_context(struct iris_batch *batch) iris_alloc_push_constants(batch); -#if GEN_GEN == 10 - /* Gen11+ is enabled for us by the kernel. */ - iris_enable_obj_preemption(batch, true); + +#if GEN_GEN >= 12 + init_aux_map_state(batch); #endif + + iris_batch_sync_region_end(batch); } static void @@ -1005,16 +1052,38 @@ iris_init_compute_context(struct iris_batch *batch) { UNUSED const struct gen_device_info *devinfo = &batch->screen->devinfo; + iris_batch_sync_region_start(batch); + + /* GEN:BUG:1607854226: + * + * Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS. + */ +#if GEN_GEN == 12 + emit_pipeline_select(batch, _3D); +#else emit_pipeline_select(batch, GPGPU); +#endif - iris_emit_default_l3_config(batch, devinfo, true); + iris_emit_l3_config(batch, batch->screen->l3_config_cs); init_state_base_address(batch); + iris_init_common_context(batch); + +#if GEN_GEN == 12 + emit_pipeline_select(batch, GPGPU); +#endif + #if GEN_GEN == 9 if (devinfo->is_geminilake) init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU); #endif + +#if GEN_GEN >= 12 + init_aux_map_state(batch); +#endif + + iris_batch_sync_region_end(batch); } struct iris_vertex_buffer_state { @@ -1032,7 +1101,8 @@ struct iris_depth_buffer_state { uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) + GENX(3DSTATE_STENCIL_BUFFER_length) + GENX(3DSTATE_HIER_DEPTH_BUFFER_length) + - GENX(3DSTATE_CLEAR_PARAMS_length)]; + GENX(3DSTATE_CLEAR_PARAMS_length) + + GENX(MI_LOAD_REGISTER_IMM_length) * 2]; }; /** @@ -1234,12 +1304,10 @@ iris_bind_blend_state(struct pipe_context *ctx, void *state) struct iris_blend_state *cso = state; ice->state.cso_blend = cso; - ice->state.blend_enables = cso ? cso->blend_enables : 0; ice->state.dirty |= IRIS_DIRTY_PS_BLEND; ice->state.dirty |= IRIS_DIRTY_BLEND_STATE; - ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; - ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_BLEND]; + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND]; if (GEN_GEN == 8) ice->state.dirty |= IRIS_DIRTY_PMA_FIX; @@ -1308,7 +1376,7 @@ iris_create_zsa_state(struct pipe_context *ctx, state->stencil[0].writemask != 0 || (two_sided_stencil && state->stencil[1].writemask != 0); - /* The state tracker needs to optimize away EQUAL writes for us. */ + /* gallium frontends need to optimize away EQUAL writes for us. 
*/ assert(!(state->depth.func == PIPE_FUNC_EQUAL && state->depth.writemask)); iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) { @@ -1335,6 +1403,9 @@ iris_create_zsa_state(struct pipe_context *ctx, wmds.BackfaceStencilTestMask = state->stencil[1].valuemask; wmds.BackfaceStencilWriteMask = state->stencil[1].writemask; /* wmds.[Backface]StencilReferenceValue are merged later */ +#if GEN_GEN >= 12 + wmds.StencilReferenceValueModifyDisable = true; +#endif } #if GEN_GEN >= 12 @@ -1387,7 +1458,8 @@ iris_bind_zsa_state(struct pipe_context *ctx, void *state) ice->state.cso_zsa = new_cso; ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT; ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL; - ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA]; + ice->state.stage_dirty |= + ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA]; if (GEN_GEN == 8) ice->state.dirty |= IRIS_DIRTY_PMA_FIX; @@ -1798,13 +1870,14 @@ iris_bind_rasterizer_state(struct pipe_context *ctx, void *state) ice->state.dirty |= IRIS_DIRTY_SBE; if (cso_changed(conservative_rasterization)) - ice->state.dirty |= IRIS_DIRTY_FS; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS; } ice->state.cso_rast = new_cso; ice->state.dirty |= IRIS_DIRTY_RASTER; ice->state.dirty |= IRIS_DIRTY_CLIP; - ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_RASTERIZER]; + ice->state.stage_dirty |= + ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER]; } /** @@ -1946,7 +2019,7 @@ iris_bind_sampler_states(struct pipe_context *ctx, } if (dirty) - ice->state.dirty |= IRIS_DIRTY_SAMPLER_STATES_VS << stage; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage; } /** @@ -1961,7 +2034,7 @@ iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage) struct iris_shader_state *shs = &ice->state.shaders[stage]; const struct shader_info *info = iris_get_shader_info(ice, stage); - /* We assume the state tracker will call pipe->bind_sampler_states() + /* We assume gallium frontends will call pipe->bind_sampler_states() * if the program's number of textures changes. */ unsigned count = info ? util_last_bit(info->textures_used) : 0; @@ -1980,10 +2053,12 @@ iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage) return; struct pipe_resource *res = shs->sampler_table.res; - shs->sampler_table.offset += - iris_bo_offset_from_base_address(iris_resource_bo(res)); + struct iris_bo *bo = iris_resource_bo(res); - iris_record_state_size(ice->state.sizes, shs->sampler_table.offset, size); + iris_record_state_size(ice->state.sizes, + bo->gtt_offset + shs->sampler_table.offset, size); + + shs->sampler_table.offset += iris_bo_offset_from_base_address(bo); /* Make sure all land in the same BO */ iris_border_color_pool_reserve(ice, IRIS_MAX_TEXTURE_SAMPLERS); @@ -2098,18 +2173,18 @@ fill_buffer_surface_state(struct isl_device *isl_dev, .format = format, .swizzle = swizzle, .stride_B = cpp, - .mocs = mocs(res->bo, isl_dev)); + .mocs = iris_mocs(res->bo, isl_dev)); } #define SURFACE_STATE_ALIGNMENT 64 /** * Allocate several contiguous SURFACE_STATE structures, one for each - * supported auxiliary surface mode. + * supported auxiliary surface mode. This only allocates the CPU-side + * copy, they will need to be uploaded later after they're filled in. 
*/ -static void * -alloc_surface_states(struct u_upload_mgr *mgr, - struct iris_state_ref *ref, +static void +alloc_surface_states(struct iris_surface_state *surf_state, unsigned aux_usages) { const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length); @@ -2119,13 +2194,68 @@ alloc_surface_states(struct u_upload_mgr *mgr, assert(aux_usages != 0); + /* In case we're re-allocating them... */ + free(surf_state->cpu); + + surf_state->num_states = util_bitcount(aux_usages); + surf_state->cpu = calloc(surf_state->num_states, surf_size); + surf_state->ref.offset = 0; + pipe_resource_reference(&surf_state->ref.res, NULL); + + assert(surf_state->cpu); +} + +/** + * Upload the CPU side SURFACE_STATEs into a GPU buffer. + */ +static void +upload_surface_states(struct u_upload_mgr *mgr, + struct iris_surface_state *surf_state) +{ + const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length); + const unsigned bytes = surf_state->num_states * surf_size; + void *map = - upload_state(mgr, ref, util_bitcount(aux_usages) * surf_size, - SURFACE_STATE_ALIGNMENT); + upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT); - ref->offset += iris_bo_offset_from_base_address(iris_resource_bo(ref->res)); + surf_state->ref.offset += + iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res)); - return map; + if (map) + memcpy(map, surf_state->cpu, bytes); +} + +/** + * Update resource addresses in a set of SURFACE_STATE descriptors, + * and re-upload them if necessary. + */ +static bool +update_surface_state_addrs(struct u_upload_mgr *mgr, + struct iris_surface_state *surf_state, + struct iris_bo *bo) +{ + if (surf_state->bo_address == bo->gtt_offset) + return false; + + STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0); + STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64); + + uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32]; + + /* First, update the CPU copies. We assume no other fields exist in + * the QWord containing Surface Base Address. + */ + for (unsigned i = 0; i < surf_state->num_states; i++) { + *ss_addr = *ss_addr - surf_state->bo_address + bo->gtt_offset; + ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT; + } + + /* Next, upload the updated copies to a GPU buffer. 
*/ + upload_surface_states(mgr, surf_state); + + surf_state->bo_address = bo->gtt_offset; + + return true; } #if GEN_GEN == 8 @@ -2141,11 +2271,11 @@ get_rt_read_isl_surf(const struct gen_device_info *devinfo, struct iris_resource *res, enum pipe_texture_target target, struct isl_view *view, + uint32_t *offset_to_tile, uint32_t *tile_x_sa, uint32_t *tile_y_sa, struct isl_surf *surf) { - *surf = res->surf; const enum isl_dim_layout dim_layout = @@ -2168,9 +2298,9 @@ get_rt_read_isl_surf(const struct gen_device_info *devinfo, assert(view->levels == 1 && view->array_len == 1); assert(*tile_x_sa == 0 && *tile_y_sa == 0); - res->offset += iris_resource_get_tile_offsets(res, view->base_level, - view->base_array_layer, - tile_x_sa, tile_y_sa); + *offset_to_tile = iris_resource_get_tile_offsets(res, view->base_level, + view->base_array_layer, + tile_x_sa, tile_y_sa); const unsigned l = view->base_level; surf->logical_level0_px.width = minify(surf->logical_level0_px.width, l); @@ -2195,14 +2325,15 @@ fill_surface_state(struct isl_device *isl_dev, struct isl_surf *surf, struct isl_view *view, unsigned aux_usage, + uint32_t extra_main_offset, uint32_t tile_x_sa, uint32_t tile_y_sa) { struct isl_surf_fill_state_info f = { .surf = surf, .view = view, - .mocs = mocs(res->bo, isl_dev), - .address = res->bo->gtt_offset + res->offset, + .mocs = iris_mocs(res->bo, isl_dev), + .address = res->bo->gtt_offset + res->offset + extra_main_offset, .x_offset_sa = tile_x_sa, .y_offset_sa = tile_y_sa, }; @@ -2262,11 +2393,9 @@ iris_create_sampler_view(struct pipe_context *ctx, isv->res = (struct iris_resource *) tex; - void *map = alloc_surface_states(ice->state.surface_uploader, - &isv->surface_state, - isv->res->aux.sampler_usages); - if (!unlikely(map)) - return NULL; + alloc_surface_states(&isv->surface_state, isv->res->aux.sampler_usages); + + isv->surface_state.bo_address = isv->res->bo->gtt_offset; isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT; @@ -2290,6 +2419,8 @@ iris_create_sampler_view(struct pipe_context *ctx, .usage = usage, }; + void *map = isv->surface_state.cpu; + /* Fill out SURFACE_STATE for this view. */ if (tmpl->target != PIPE_BUFFER) { isv->view.base_level = tmpl->u.tex.first_level; @@ -2310,7 +2441,7 @@ iris_create_sampler_view(struct pipe_context *ctx, * surface state with HiZ. 
*/ fill_surface_state(&screen->isl_dev, map, isv->res, &isv->res->surf, - &isv->view, aux_usage, 0, 0); + &isv->view, aux_usage, 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; } @@ -2320,6 +2451,8 @@ iris_create_sampler_view(struct pipe_context *ctx, tmpl->u.buf.offset, tmpl->u.buf.size); } + upload_surface_states(ice->state.surface_uploader, &isv->surface_state); + return &isv->base; } @@ -2329,7 +2462,8 @@ iris_sampler_view_destroy(struct pipe_context *ctx, { struct iris_sampler_view *isv = (void *) state; pipe_resource_reference(&state->texture, NULL); - pipe_resource_reference(&isv->surface_state.res, NULL); + pipe_resource_reference(&isv->surface_state.ref.res, NULL); + free(isv->surface_state.cpu); free(isv); } @@ -2425,56 +2559,49 @@ iris_create_surface(struct pipe_context *ctx, return psurf; - void *map = alloc_surface_states(ice->state.surface_uploader, - &surf->surface_state, - res->aux.possible_usages); - if (!unlikely(map)) { - pipe_resource_reference(&surf->surface_state.res, NULL); - return NULL; - } + alloc_surface_states(&surf->surface_state, res->aux.possible_usages); + surf->surface_state.bo_address = res->bo->gtt_offset; #if GEN_GEN == 8 - void *map_read = alloc_surface_states(ice->state.surface_uploader, - &surf->surface_state_read, - res->aux.possible_usages); - if (!unlikely(map_read)) { - pipe_resource_reference(&surf->surface_state_read.res, NULL); - return NULL; - } + alloc_surface_states(&surf->surface_state_read, res->aux.possible_usages); + surf->surface_state_read.bo_address = res->bo->gtt_offset; #endif if (!isl_format_is_compressed(res->surf.format)) { if (iris_resource_unfinished_aux_import(res)) iris_resource_finish_aux_import(&screen->base, res); + void *map = surf->surface_state.cpu; + UNUSED void *map_read = surf->surface_state_read.cpu; + /* This is a normal surface. Fill out a SURFACE_STATE for each possible * auxiliary surface mode and return the pipe_surface. */ unsigned aux_modes = res->aux.possible_usages; while (aux_modes) { -#if GEN_GEN == 8 - uint32_t offset = res->offset; -#endif enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes); fill_surface_state(&screen->isl_dev, map, res, &res->surf, - view, aux_usage, 0, 0); + view, aux_usage, 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; #if GEN_GEN == 8 struct isl_surf surf; - uint32_t tile_x_sa = 0, tile_y_sa = 0; + uint32_t offset_to_tile = 0, tile_x_sa = 0, tile_y_sa = 0; get_rt_read_isl_surf(devinfo, res, target, read_view, - &tile_x_sa, &tile_y_sa, &surf); + &offset_to_tile, &tile_x_sa, &tile_y_sa, &surf); fill_surface_state(&screen->isl_dev, map_read, res, &surf, read_view, - aux_usage, tile_x_sa, tile_y_sa); - /* Restore offset because we change offset in case of handling - * non_coherent fb fetch - */ - res->offset = offset; + aux_usage, offset_to_tile, tile_x_sa, tile_y_sa); map_read += SURFACE_STATE_ALIGNMENT; #endif } + upload_surface_states(ice->state.surface_uploader, &surf->surface_state); + +#if GEN_GEN == 8 + upload_surface_states(ice->state.surface_uploader, + &surf->surface_state_read); +#endif + return psurf; } @@ -2506,7 +2633,7 @@ iris_create_surface(struct pipe_context *ctx, * texture, the tile offsets may be anything and we can't rely on * X/Y Offset. * - * Return NULL to force the state tracker to take fallback paths. + * Return NULL to force gallium frontends to take fallback paths. 
*/ if (view->array_len > 1 || GEN_GEN == 8) return NULL; @@ -2547,13 +2674,16 @@ iris_create_surface(struct pipe_context *ctx, struct isl_surf_fill_state_info f = { .surf = &isl_surf, .view = view, - .mocs = mocs(res->bo, &screen->isl_dev), + .mocs = iris_mocs(res->bo, &screen->isl_dev), .address = res->bo->gtt_offset + offset_B, .x_offset_sa = tile_x_sa, .y_offset_sa = tile_y_sa, }; - isl_surf_fill_state_s(&screen->isl_dev, map, &f); + isl_surf_fill_state_s(&screen->isl_dev, surf->surface_state.cpu, &f); + + upload_surface_states(ice->state.surface_uploader, &surf->surface_state); + return psurf; } @@ -2598,7 +2728,6 @@ iris_set_shader_images(struct pipe_context *ctx, { struct iris_context *ice = (struct iris_context *) ctx; struct iris_screen *screen = (struct iris_screen *)ctx->screen; - const struct gen_device_info *devinfo = &screen->devinfo; gl_shader_stage stage = stage_from_pipe(p_stage); struct iris_shader_state *shs = &ice->state.shaders[stage]; #if GEN_GEN == 8 @@ -2615,12 +2744,6 @@ iris_set_shader_images(struct pipe_context *ctx, const struct pipe_image_view *img = &p_images[i]; struct iris_resource *res = (void *) img->resource; - void *map = - alloc_surface_states(ice->state.surface_uploader, - &iv->surface_state, 1 << ISL_AUX_USAGE_NONE); - if (!unlikely(map)) - return; - util_copy_image_view(&iv->base, img); shs->bound_image_views |= 1 << (start_slot + i); @@ -2628,25 +2751,16 @@ iris_set_shader_images(struct pipe_context *ctx, res->bind_history |= PIPE_BIND_SHADER_IMAGE; res->bind_stages |= 1 << stage; - isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT; - enum isl_format isl_fmt = - iris_format_for_usage(devinfo, img->format, usage).fmt; + enum isl_format isl_fmt = iris_image_view_get_format(ice, img); - bool untyped_fallback = false; + /* Render compression with images supported on gen12+ only. */ + unsigned aux_usages = GEN_GEN >= 12 ? res->aux.possible_usages : + 1 << ISL_AUX_USAGE_NONE; - if (img->shader_access & PIPE_IMAGE_ACCESS_READ) { - /* On Gen8, try to use typed surfaces reads (which support a - * limited number of formats), and if not possible, fall back - * to untyped reads. - */ - untyped_fallback = GEN_GEN == 8 && - !isl_has_matching_typed_storage_image_format(devinfo, isl_fmt); + alloc_surface_states(&iv->surface_state, aux_usages); + iv->surface_state.bo_address = res->bo->gtt_offset; - if (untyped_fallback) - isl_fmt = ISL_FORMAT_RAW; - else - isl_fmt = isl_lower_storage_image_format(devinfo, isl_fmt); - } + void *map = iv->surface_state.cpu; if (res->base.target != PIPE_BUFFER) { struct isl_view view = { @@ -2656,21 +2770,21 @@ iris_set_shader_images(struct pipe_context *ctx, .base_array_layer = img->u.tex.first_layer, .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1, .swizzle = ISL_SWIZZLE_IDENTITY, - .usage = usage, + .usage = ISL_SURF_USAGE_STORAGE_BIT, }; - if (untyped_fallback) { + /* If using untyped fallback. 
*/ + if (isl_fmt == ISL_FORMAT_RAW) { fill_buffer_surface_state(&screen->isl_dev, res, map, isl_fmt, ISL_SWIZZLE_IDENTITY, 0, res->bo->size); } else { - /* Images don't support compression */ - unsigned aux_modes = 1 << ISL_AUX_USAGE_NONE; + unsigned aux_modes = aux_usages; while (aux_modes) { enum isl_aux_usage usage = u_bit_scan(&aux_modes); fill_surface_state(&screen->isl_dev, map, res, &res->surf, - &view, usage, 0, 0); + &view, usage, 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; } @@ -2689,21 +2803,23 @@ iris_set_shader_images(struct pipe_context *ctx, fill_buffer_image_param(&image_params[start_slot + i], img->format, img->u.buf.size); } + + upload_surface_states(ice->state.surface_uploader, &iv->surface_state); } else { pipe_resource_reference(&iv->base.resource, NULL); - pipe_resource_reference(&iv->surface_state.res, NULL); + pipe_resource_reference(&iv->surface_state.ref.res, NULL); fill_default_image_param(&image_params[start_slot + i]); } } - ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << stage; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage; ice->state.dirty |= stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; /* Broadwell also needs brw_image_params re-uploaded */ if (GEN_GEN < 9) { - ice->state.dirty |= IRIS_DIRTY_CONSTANTS_VS << stage; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage; shs->sysvals_need_upload = true; } } @@ -2734,15 +2850,51 @@ iris_set_sampler_views(struct pipe_context *ctx, view->res->bind_stages |= 1 << stage; shs->bound_sampler_views |= 1 << (start + i); + + update_surface_state_addrs(ice->state.surface_uploader, + &view->surface_state, view->res->bo); } } - ice->state.dirty |= (IRIS_DIRTY_BINDINGS_VS << stage); + ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage); ice->state.dirty |= stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; } +static void +iris_set_compute_resources(struct pipe_context *ctx, + unsigned start, unsigned count, + struct pipe_surface **resources) +{ + assert(count == 0); +} + +static void +iris_set_global_binding(struct pipe_context *ctx, + unsigned start_slot, unsigned count, + struct pipe_resource **resources, + uint32_t **handles) +{ + struct iris_context *ice = (struct iris_context *) ctx; + + assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS); + for (unsigned i = 0; i < count; i++) { + if (resources && resources[i]) { + pipe_resource_reference(&ice->state.global_bindings[start_slot + i], + resources[i]); + struct iris_resource *res = (void *) resources[i]; + uint64_t addr = res->bo->gtt_offset; + memcpy(handles[i], &addr, sizeof(addr)); + } else { + pipe_resource_reference(&ice->state.global_bindings[start_slot + i], + NULL); + } + } + + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS; +} + /** * The pipe->set_tess_state() driver hook. 
*/ @@ -2757,7 +2909,7 @@ iris_set_tess_state(struct pipe_context *ctx, memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float)); memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float)); - ice->state.dirty |= IRIS_DIRTY_CONSTANTS_TCS; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS; shs->sysvals_need_upload = true; } @@ -2766,8 +2918,9 @@ iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf) { struct iris_surface *surf = (void *) p_surf; pipe_resource_reference(&p_surf->texture, NULL); - pipe_resource_reference(&surf->surface_state.res, NULL); - pipe_resource_reference(&surf->surface_state_read.res, NULL); + pipe_resource_reference(&surf->surface_state.ref.res, NULL); + pipe_resource_reference(&surf->surface_state_read.ref.res, NULL); + free(surf->surface_state.cpu); free(surf); } @@ -2782,8 +2935,9 @@ iris_set_clip_state(struct pipe_context *ctx, memcpy(&ice->state.clip_planes, state, sizeof(*state)); - ice->state.dirty |= IRIS_DIRTY_CONSTANTS_VS | IRIS_DIRTY_CONSTANTS_GS | - IRIS_DIRTY_CONSTANTS_TES; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS | + IRIS_STAGE_DIRTY_CONSTANTS_GS | + IRIS_STAGE_DIRTY_CONSTANTS_TES; shs->sysvals_need_upload = true; gshs->sysvals_need_upload = true; tshs->sysvals_need_upload = true; @@ -2863,10 +3017,12 @@ iris_set_stencil_ref(struct pipe_context *ctx, { struct iris_context *ice = (struct iris_context *) ctx; memcpy(&ice->state.stencil_ref, state, sizeof(*state)); - if (GEN_GEN == 8) - ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE; - else + if (GEN_GEN >= 12) + ice->state.dirty |= IRIS_DIRTY_STENCIL_REF; + else if (GEN_GEN >= 9) ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL; + else + ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE; } static float @@ -2924,7 +3080,7 @@ iris_set_framebuffer_state(struct pipe_context *ctx, /* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */ if (GEN_GEN >= 9 && (cso->samples == 16 || samples == 16)) - ice->state.dirty |= IRIS_DIRTY_FS; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS; } if (cso->nr_cbufs != state->nr_cbufs) { @@ -2973,7 +3129,7 @@ iris_set_framebuffer_state(struct pipe_context *ctx, info.depth_surf = &zres->surf; info.depth_address = zres->bo->gtt_offset + zres->offset; - info.mocs = mocs(zres->bo, isl_dev); + info.mocs = iris_mocs(zres->bo, isl_dev); view.format = zres->surf.format; @@ -2991,7 +3147,7 @@ iris_set_framebuffer_state(struct pipe_context *ctx, info.stencil_address = stencil_res->bo->gtt_offset + stencil_res->offset; if (!zres) { view.format = stencil_res->surf.format; - info.mocs = mocs(stencil_res->bo, isl_dev); + info.mocs = iris_mocs(stencil_res->bo, isl_dev); } } } @@ -3010,13 +3166,14 @@ iris_set_framebuffer_state(struct pipe_context *ctx, iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res)); /* Render target change */ - ice->state.dirty |= IRIS_DIRTY_BINDINGS_FS; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS; ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER; ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; - ice->state.dirty |= ice->state.dirty_for_nos[IRIS_NOS_FRAMEBUFFER]; + ice->state.stage_dirty |= + ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER]; if (GEN_GEN == 8) ice->state.dirty |= IRIS_DIRTY_PMA_FIX; @@ -3076,31 +3233,40 @@ iris_set_constant_buffer(struct pipe_context *ctx, pipe_resource_reference(&cbuf->buffer, NULL); } - ice->state.dirty |= IRIS_DIRTY_CONSTANTS_VS << stage; + ice->state.stage_dirty |= 
IRIS_STAGE_DIRTY_CONSTANTS_VS << stage; } static void upload_sysvals(struct iris_context *ice, - gl_shader_stage stage) + gl_shader_stage stage, + const struct pipe_grid_info *grid) { UNUSED struct iris_genx_state *genx = ice->state.genx; struct iris_shader_state *shs = &ice->state.shaders[stage]; struct iris_compiled_shader *shader = ice->shaders.prog[stage]; - if (!shader || shader->num_system_values == 0) + if (!shader || (shader->num_system_values == 0 && + shader->kernel_input_size == 0)) return; assert(shader->num_cbufs > 0); unsigned sysval_cbuf_index = shader->num_cbufs - 1; struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index]; - unsigned upload_size = shader->num_system_values * sizeof(uint32_t); - uint32_t *map = NULL; + unsigned system_values_start = + ALIGN(shader->kernel_input_size, sizeof(uint32_t)); + unsigned upload_size = system_values_start + + shader->num_system_values * sizeof(uint32_t); + void *map = NULL; assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS); u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64, - &cbuf->buffer_offset, &cbuf->buffer, (void **) &map); + &cbuf->buffer_offset, &cbuf->buffer, &map); + + if (shader->kernel_input_size > 0) + memcpy(map, grid->input, shader->kernel_input_size); + uint32_t *sysval_map = map + system_values_start; for (int i = 0; i < shader->num_system_values; i++) { uint32_t sysval = shader->system_values[i]; uint32_t value = 0; @@ -3141,11 +3307,15 @@ upload_sysvals(struct iris_context *ice, value = fui(ice->state.default_inner_level[0]); } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) { value = fui(ice->state.default_inner_level[1]); + } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X && + sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) { + unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X; + value = ice->state.last_block[i]; } else { assert(!"unhandled system value"); } - *map++ = value; + *sysval_map++ = value; } cbuf->buffer_size = upload_size; @@ -3205,7 +3375,7 @@ iris_set_shader_buffers(struct pipe_context *ctx, } } - ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << stage; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage; } static void @@ -3258,10 +3428,10 @@ iris_set_vertex_buffers(struct pipe_context *ctx, vb.AddressModifyEnable = true; vb.BufferPitch = buffer->stride; if (res) { - vb.BufferSize = res->bo->size - (int) buffer->buffer_offset; + vb.BufferSize = res->base.width0 - (int) buffer->buffer_offset; vb.BufferStartingAddress = ro_bo(NULL, res->bo->gtt_offset + (int) buffer->buffer_offset); - vb.MOCS = mocs(res->bo, &screen->isl_dev); + vb.MOCS = iris_mocs(res->bo, &screen->isl_dev); } else { vb.NullVertexBuffer = true; } @@ -3559,17 +3729,18 @@ iris_set_stream_output_targets(struct pipe_context *ctx, sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i; #endif sob.SurfaceBaseAddress = - rw_bo(NULL, res->bo->gtt_offset + tgt->base.buffer_offset); + rw_bo(NULL, res->bo->gtt_offset + tgt->base.buffer_offset, + IRIS_DOMAIN_OTHER_WRITE); sob.SOBufferEnable = true; sob.StreamOffsetWriteEnable = true; sob.StreamOutputBufferOffsetAddressEnable = true; - sob.MOCS = mocs(res->bo, &screen->isl_dev); + sob.MOCS = iris_mocs(res->bo, &screen->isl_dev); sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1; sob.StreamOffset = offset; sob.StreamOutputBufferOffsetAddress = rw_bo(NULL, iris_resource_bo(tgt->offset.res)->gtt_offset + - tgt->offset.offset); + tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE); } } @@ -3785,7 +3956,8 @@ iris_emit_sbe_swiz(struct iris_batch 
*batch, /* XXX: this should be generated when putting programs in place */ - for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) { + for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) { + const uint8_t fs_attr = wm_prog_data->urb_setup_attribs[idx]; const int input_index = wm_prog_data->urb_setup[fs_attr]; if (input_index < 0 || input_index >= 16) continue; @@ -3937,14 +4109,14 @@ static void iris_populate_vs_key(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_vs_prog_key *key) + struct iris_vs_prog_key *key) { const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast; if (info->clip_distance_array_size == 0 && (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && last_stage == MESA_SHADER_VERTEX) - key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; + key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; } /** @@ -3952,7 +4124,7 @@ iris_populate_vs_key(const struct iris_context *ice, */ static void iris_populate_tcs_key(const struct iris_context *ice, - struct brw_tcs_prog_key *key) + struct iris_tcs_prog_key *key) { } @@ -3963,14 +4135,14 @@ static void iris_populate_tes_key(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_tes_prog_key *key) + struct iris_tes_prog_key *key) { const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast; if (info->clip_distance_array_size == 0 && (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && last_stage == MESA_SHADER_TESS_EVAL) - key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; + key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; } /** @@ -3980,14 +4152,14 @@ static void iris_populate_gs_key(const struct iris_context *ice, const struct shader_info *info, gl_shader_stage last_stage, - struct brw_gs_prog_key *key) + struct iris_gs_prog_key *key) { const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast; if (info->clip_distance_array_size == 0 && (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && last_stage == MESA_SHADER_GEOMETRY) - key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; + key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; } /** @@ -3996,7 +4168,7 @@ iris_populate_gs_key(const struct iris_context *ice, static void iris_populate_fs_key(const struct iris_context *ice, const struct shader_info *info, - struct brw_wm_prog_key *key) + struct iris_fs_prog_key *key) { struct iris_screen *screen = (void *) ice->ctx.screen; const struct pipe_framebuffer_state *fb = &ice->state.framebuffer; @@ -4029,7 +4201,7 @@ iris_populate_fs_key(const struct iris_context *ice, static void iris_populate_cs_key(const struct iris_context *ice, - struct brw_cs_prog_key *key) + struct iris_cs_prog_key *key) { } @@ -4040,17 +4212,9 @@ KSP(const struct iris_compiled_shader *shader) return iris_bo_offset_from_base_address(res->bo) + shader->assembly.offset; } -/* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable - * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit - * this WA on C0 stepping. - * - * TODO: Fill out SamplerCount for prefetching? - */ - #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \ pkt.KernelStartPointer = KSP(shader); \ - pkt.BindingTableEntryCount = GEN_GEN == 11 ? 
0 : \ - shader->bt.size_bytes / 4; \ + pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \ pkt.FloatingPointMode = prog_data->use_alt_mode; \ \ pkt.DispatchGRFStartRegisterForURBData = \ @@ -4066,7 +4230,8 @@ KSP(const struct iris_compiled_shader *shader) iris_get_scratch_space(ice, prog_data->total_scratch, stage); \ uint32_t scratch_addr = bo->gtt_offset; \ pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \ - pkt.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr); \ + pkt.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr, \ + IRIS_DOMAIN_NONE); \ } /** @@ -4104,10 +4269,29 @@ iris_store_tcs_state(struct iris_context *ice, iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) { INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL); +#if GEN_GEN >= 12 + /* GEN:BUG:1604578095: + * + * Hang occurs when the number of max threads is less than 2 times + * the number of instance count. The number of max threads must be + * more than 2 times the number of instance count. + */ + assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances); + hs.DispatchGRFStartRegisterForURBData = prog_data->dispatch_grf_start_reg & 0x1f; + hs.DispatchGRFStartRegisterForURBData5 = prog_data->dispatch_grf_start_reg >> 5; +#endif + hs.InstanceCount = tcs_prog_data->instances - 1; hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; hs.IncludeVertexHandles = true; +#if GEN_GEN == 12 + /* Patch Count threshold specifies the maximum number of patches that + * will be accumulated before a thread dispatch is forced. + */ + hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold; +#endif + #if GEN_GEN >= 9 hs.DispatchMode = vue_prog_data->dispatch_mode; hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id; @@ -4217,9 +4401,7 @@ iris_store_fs_state(struct iris_context *ice, iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) { ps.VectorMaskEnable = true; - // XXX: WABTPPrefetchDisable, see above, drop at C0 - ps.BindingTableEntryCount = GEN_GEN == 11 ? 0 : - shader->bt.size_bytes / 4; + ps.BindingTableEntryCount = shader->bt.size_bytes / 4; ps.FloatingPointMode = prog_data->use_alt_mode; ps.MaximumNumberofThreadsPerPSD = 64 - (GEN_GEN == 8 ? 2 : 1); @@ -4247,7 +4429,8 @@ iris_store_fs_state(struct iris_context *ice, MESA_SHADER_FRAGMENT); uint32_t scratch_addr = bo->gtt_offset; ps.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; - ps.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr); + ps.ScratchSpaceBasePointer = rw_bo(NULL, scratch_addr, + IRIS_DOMAIN_NONE); } } @@ -4283,14 +4466,24 @@ iris_store_cs_state(struct iris_context *ice, void *map = shader->derived_data; iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) { - desc.KernelStartPointer = KSP(shader); desc.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs; - desc.NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads; desc.SharedLocalMemorySize = encode_slm_size(GEN_GEN, prog_data->total_shared); desc.BarrierEnable = cs_prog_data->uses_barrier; desc.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs; +#if GEN_GEN >= 12 + /* TODO: Check if we are missing workarounds and enable mid-thread + * preemption. + * + * We still have issues with mid-thread preemption (it was already + * disabled by the kernel on gen11, due to missing workarounds). It's + * possible that we are just missing some workarounds, and could enable + * it later, but for now let's disable it to fix a GPU in compute in Car + * Chase (and possibly more). 
+ */ + desc.ThreadPreemptionDisable = true; +#endif } } @@ -4368,7 +4561,7 @@ use_null_surface(struct iris_batch *batch, struct iris_context *ice) { struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res); - iris_use_pinned_bo(batch, state_bo, false); + iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE); return ice->state.unbound_tex.offset; } @@ -4382,7 +4575,7 @@ use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice) struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res); - iris_use_pinned_bo(batch, state_bo, false); + iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE); return ice->state.null_fb.offset; } @@ -4392,6 +4585,7 @@ surf_state_offset_for_aux(struct iris_resource *res, unsigned aux_modes, enum isl_aux_usage aux_usage) { + assert(aux_modes & (1 << aux_usage)); return SURFACE_STATE_ALIGNMENT * util_bitcount(aux_modes & ((1 << aux_usage) - 1)); } @@ -4443,7 +4637,7 @@ static void update_clear_value(struct iris_context *ice, struct iris_batch *batch, struct iris_resource *res, - struct iris_state_ref *state, + struct iris_surface_state *surf_state, unsigned all_aux_modes, struct isl_view *view) { @@ -4460,19 +4654,23 @@ update_clear_value(struct iris_context *ice, while (aux_modes) { enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes); - surf_state_update_clear_value(batch, res, state, all_aux_modes, - aux_usage); + surf_state_update_clear_value(batch, res, &surf_state->ref, + all_aux_modes, aux_usage); } #elif GEN_GEN == 8 - pipe_resource_reference(&state->res, NULL); + /* TODO: Could update rather than re-filling */ + alloc_surface_states(surf_state, all_aux_modes); + + void *map = surf_state->cpu; - void *map = alloc_surface_states(ice->state.surface_uploader, - state, all_aux_modes); while (aux_modes) { enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes); - fill_surface_state(isl_dev, map, res, &res->surf, view, aux_usage, 0, 0); + fill_surface_state(isl_dev, map, res, &res->surf, view, aux_usage, + 0, 0, 0); map += SURFACE_STATE_ALIGNMENT; } + + upload_surface_states(ice->state.surface_uploader, surf_state); #endif } @@ -4488,23 +4686,27 @@ use_surface(struct iris_context *ice, struct pipe_surface *p_surf, bool writeable, enum isl_aux_usage aux_usage, - bool is_read_surface) + bool is_read_surface, + enum iris_domain access) { struct iris_surface *surf = (void *) p_surf; struct iris_resource *res = (void *) p_surf->texture; uint32_t offset = 0; - iris_use_pinned_bo(batch, iris_resource_bo(p_surf->texture), writeable); + iris_use_pinned_bo(batch, iris_resource_bo(p_surf->texture), + writeable, access); if (GEN_GEN == 8 && is_read_surface) { - iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state_read.res), false); + iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state_read.ref.res), false, + IRIS_DOMAIN_NONE); } else { - iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state.res), false); + iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state.ref.res), false, + IRIS_DOMAIN_NONE); } if (res->aux.bo) { - iris_use_pinned_bo(batch, res->aux.bo, writeable); + iris_use_pinned_bo(batch, res->aux.bo, writeable, access); if (res->aux.clear_color_bo) - iris_use_pinned_bo(batch, res->aux.clear_color_bo, false); + iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access); if (memcmp(&res->aux.clear_color, &surf->clear_color, sizeof(surf->clear_color)) != 0) { @@ -4518,8 +4720,9 @@ use_surface(struct iris_context *ice, } } - offset = (GEN_GEN == 8 && is_read_surface) ? 
surf->surface_state_read.offset - : surf->surface_state.offset; + offset = (GEN_GEN == 8 && is_read_surface) + ? surf->surface_state_read.ref.offset + : surf->surface_state.ref.offset; return offset + surf_state_offset_for_aux(res, res->aux.possible_usages, aux_usage); @@ -4530,17 +4733,19 @@ use_sampler_view(struct iris_context *ice, struct iris_batch *batch, struct iris_sampler_view *isv) { - // XXX: ASTC hacks enum isl_aux_usage aux_usage = - iris_resource_texture_aux_usage(ice, isv->res, isv->view.format, 0); + iris_resource_texture_aux_usage(ice, isv->res, isv->view.format); - iris_use_pinned_bo(batch, isv->res->bo, false); - iris_use_pinned_bo(batch, iris_resource_bo(isv->surface_state.res), false); + iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_OTHER_READ); + iris_use_pinned_bo(batch, iris_resource_bo(isv->surface_state.ref.res), false, + IRIS_DOMAIN_NONE); if (isv->res->aux.bo) { - iris_use_pinned_bo(batch, isv->res->aux.bo, false); + iris_use_pinned_bo(batch, isv->res->aux.bo, + false, IRIS_DOMAIN_OTHER_READ); if (isv->res->aux.clear_color_bo) - iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo, false); + iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo, + false, IRIS_DOMAIN_OTHER_READ); if (memcmp(&isv->res->aux.clear_color, &isv->clear_color, sizeof(isv->clear_color)) != 0) { update_clear_value(ice, batch, isv->res, &isv->surface_state, @@ -4549,7 +4754,7 @@ use_sampler_view(struct iris_context *ice, } } - return isv->surface_state.offset + + return isv->surface_state.ref.offset + surf_state_offset_for_aux(isv->res, isv->res->aux.sampler_usages, aux_usage); } @@ -4559,20 +4764,22 @@ use_ubo_ssbo(struct iris_batch *batch, struct iris_context *ice, struct pipe_shader_buffer *buf, struct iris_state_ref *surf_state, - bool writable) + bool writable, enum iris_domain access) { if (!buf->buffer || !surf_state->res) return use_null_surface(batch, ice); - iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable); - iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false); + iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access); + iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false, + IRIS_DOMAIN_NONE); return surf_state->offset; } static uint32_t use_image(struct iris_batch *batch, struct iris_context *ice, - struct iris_shader_state *shs, int i) + struct iris_shader_state *shs, const struct shader_info *info, + int i) { struct iris_image_view *iv = &shs->image[i]; struct iris_resource *res = (void *) iv->base.resource; @@ -4582,13 +4789,18 @@ use_image(struct iris_batch *batch, struct iris_context *ice, bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE; - iris_use_pinned_bo(batch, res->bo, write); - iris_use_pinned_bo(batch, iris_resource_bo(iv->surface_state.res), false); + iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE); + iris_use_pinned_bo(batch, iris_resource_bo(iv->surface_state.ref.res), + false, IRIS_DOMAIN_NONE); if (res->aux.bo) - iris_use_pinned_bo(batch, res->aux.bo, write); + iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE); - return iv->surface_state.offset; + enum isl_aux_usage aux_usage = + iris_image_view_aux_usage(ice, &iv->base, info); + + return iv->surface_state.ref.offset + + surf_state_offset_for_aux(res, res->aux.possible_usages, aux_usage); } #define push_bt_entry(addr) \ @@ -4639,8 +4851,10 @@ iris_populate_binding_table(struct iris_context *ice, /* surface for gl_NumWorkGroups */ struct iris_state_ref *grid_data = &ice->state.grid_size; 
struct iris_state_ref *grid_state = &ice->state.grid_surf_state; - iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false); - iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false); + iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false, + IRIS_DOMAIN_OTHER_READ); + iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false, + IRIS_DOMAIN_NONE); push_bt_entry(grid_state->offset); } @@ -4652,7 +4866,8 @@ iris_populate_binding_table(struct iris_context *ice, uint32_t addr; if (cso_fb->cbufs[i]) { addr = use_surface(ice, batch, cso_fb->cbufs[i], true, - ice->state.draw_aux_usage[i], false); + ice->state.draw_aux_usage[i], false, + IRIS_DOMAIN_RENDER_WRITE); } else { addr = use_null_fb_surface(batch, ice); } @@ -4675,7 +4890,8 @@ iris_populate_binding_table(struct iris_context *ice, uint32_t addr; if (cso_fb->cbufs[i]) { addr = use_surface(ice, batch, cso_fb->cbufs[i], - true, ice->state.draw_aux_usage[i], true); + false, ice->state.draw_aux_usage[i], true, + IRIS_DOMAIN_OTHER_READ); push_bt_entry(addr); } } @@ -4688,7 +4904,7 @@ iris_populate_binding_table(struct iris_context *ice, } foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) { - uint32_t addr = use_image(batch, ice, shs, i); + uint32_t addr = use_image(batch, ice, shs, info, i); push_bt_entry(addr); } @@ -4697,9 +4913,10 @@ iris_populate_binding_table(struct iris_context *ice, if (i == bt->sizes[IRIS_SURFACE_GROUP_UBO] - 1) { if (ish->const_data) { - iris_use_pinned_bo(batch, iris_resource_bo(ish->const_data), false); + iris_use_pinned_bo(batch, iris_resource_bo(ish->const_data), false, + IRIS_DOMAIN_OTHER_READ); iris_use_pinned_bo(batch, iris_resource_bo(ish->const_data_state.res), - false); + false, IRIS_DOMAIN_NONE); addr = ish->const_data_state.offset; } else { /* This can only happen with INTEL_DISABLE_COMPACT_BINDING_TABLE=1. */ @@ -4707,7 +4924,8 @@ iris_populate_binding_table(struct iris_context *ice, } } else { addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i], - &shs->constbuf_surf_state[i], false); + &shs->constbuf_surf_state[i], false, + IRIS_DOMAIN_OTHER_READ); } push_bt_entry(addr); @@ -4716,7 +4934,7 @@ iris_populate_binding_table(struct iris_context *ice, foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) { uint32_t addr = use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i], - shs->writable_ssbos & (1u << i)); + shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE); push_bt_entry(addr); } @@ -4730,11 +4948,12 @@ iris_populate_binding_table(struct iris_context *ice, static void iris_use_optional_res(struct iris_batch *batch, struct pipe_resource *res, - bool writeable) + bool writeable, + enum iris_domain access) { if (res) { struct iris_bo *bo = iris_resource_bo(res); - iris_use_pinned_bo(batch, bo, writeable); + iris_use_pinned_bo(batch, bo, writeable, access); } } @@ -4750,15 +4969,21 @@ pin_depth_and_stencil_buffers(struct iris_batch *batch, iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres); if (zres) { - iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled); + const enum iris_domain access = cso_zsa->depth_writes_enabled ? 
+ IRIS_DOMAIN_DEPTH_WRITE : IRIS_DOMAIN_OTHER_READ; + iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled, + access); if (zres->aux.bo) { iris_use_pinned_bo(batch, zres->aux.bo, - cso_zsa->depth_writes_enabled); + cso_zsa->depth_writes_enabled, access); } } if (sres) { - iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled); + const enum iris_domain access = cso_zsa->stencil_writes_enabled ? + IRIS_DOMAIN_DEPTH_WRITE : IRIS_DOMAIN_OTHER_READ; + iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled, + access); } } @@ -4785,25 +5010,31 @@ iris_restore_render_saved_bos(struct iris_context *ice, struct iris_genx_state *genx = ice->state.genx; const uint64_t clean = ~ice->state.dirty; + const uint64_t stage_clean = ~ice->state.stage_dirty; if (clean & IRIS_DIRTY_CC_VIEWPORT) { - iris_use_optional_res(batch, ice->state.last_res.cc_vp, false); + iris_use_optional_res(batch, ice->state.last_res.cc_vp, false, + IRIS_DOMAIN_NONE); } if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) { - iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false); + iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false, + IRIS_DOMAIN_NONE); } if (clean & IRIS_DIRTY_BLEND_STATE) { - iris_use_optional_res(batch, ice->state.last_res.blend, false); + iris_use_optional_res(batch, ice->state.last_res.blend, false, + IRIS_DOMAIN_NONE); } if (clean & IRIS_DIRTY_COLOR_CALC_STATE) { - iris_use_optional_res(batch, ice->state.last_res.color_calc, false); + iris_use_optional_res(batch, ice->state.last_res.color_calc, false, + IRIS_DOMAIN_NONE); } if (clean & IRIS_DIRTY_SCISSOR_RECT) { - iris_use_optional_res(batch, ice->state.last_res.scissor, false); + iris_use_optional_res(batch, ice->state.last_res.scissor, false, + IRIS_DOMAIN_NONE); } if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) { @@ -4812,15 +5043,15 @@ iris_restore_render_saved_bos(struct iris_context *ice, (void *) ice->state.so_target[i]; if (tgt) { iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer), - true); + true, IRIS_DOMAIN_OTHER_WRITE); iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res), - true); + true, IRIS_DOMAIN_OTHER_WRITE); } } } for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (!(clean & (IRIS_DIRTY_CONSTANTS_VS << stage))) + if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage))) continue; struct iris_shader_state *shs = &ice->state.shaders[stage]; @@ -4846,14 +5077,15 @@ iris_restore_render_saved_bos(struct iris_context *ice, struct iris_resource *res = (void *) cbuf->buffer; if (res) - iris_use_pinned_bo(batch, res->bo, false); + iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ); else - iris_use_pinned_bo(batch, batch->screen->workaround_bo, false); + iris_use_pinned_bo(batch, batch->screen->workaround_bo, false, + IRIS_DOMAIN_OTHER_READ); } } for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (clean & (IRIS_DIRTY_BINDINGS_VS << stage)) { + if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) { /* Re-pin any buffers referred to by the binding table. 
*/ iris_populate_binding_table(ice, batch, stage, true); } @@ -4863,23 +5095,24 @@ iris_restore_render_saved_bos(struct iris_context *ice, struct iris_shader_state *shs = &ice->state.shaders[stage]; struct pipe_resource *res = shs->sampler_table.res; if (res) - iris_use_pinned_bo(batch, iris_resource_bo(res), false); + iris_use_pinned_bo(batch, iris_resource_bo(res), false, + IRIS_DOMAIN_NONE); } for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (clean & (IRIS_DIRTY_VS << stage)) { + if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) { struct iris_compiled_shader *shader = ice->shaders.prog[stage]; if (shader) { struct iris_bo *bo = iris_resource_bo(shader->assembly.res); - iris_use_pinned_bo(batch, bo, false); + iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE); struct brw_stage_prog_data *prog_data = shader->prog_data; if (prog_data->total_scratch > 0) { struct iris_bo *bo = iris_get_scratch_space(ice, prog_data->total_scratch, stage); - iris_use_pinned_bo(batch, bo, true); + iris_use_pinned_bo(batch, bo, true, IRIS_DOMAIN_NONE); } } } @@ -4891,14 +5124,16 @@ iris_restore_render_saved_bos(struct iris_context *ice, pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa); } - iris_use_optional_res(batch, ice->state.last_res.index_buffer, false); + iris_use_optional_res(batch, ice->state.last_res.index_buffer, false, + IRIS_DOMAIN_OTHER_READ); if (clean & IRIS_DIRTY_VERTEX_BUFFERS) { uint64_t bound = ice->state.bound_vertex_buffers; while (bound) { const int i = u_bit_scan64(&bound); struct pipe_resource *res = genx->vertex_buffers[i].resource; - iris_use_pinned_bo(batch, iris_resource_bo(res), false); + iris_use_pinned_bo(batch, iris_resource_bo(res), false, + IRIS_DOMAIN_OTHER_READ); } } } @@ -4908,44 +5143,46 @@ iris_restore_compute_saved_bos(struct iris_context *ice, struct iris_batch *batch, const struct pipe_grid_info *grid) { - const uint64_t clean = ~ice->state.dirty; + const uint64_t stage_clean = ~ice->state.stage_dirty; const int stage = MESA_SHADER_COMPUTE; struct iris_shader_state *shs = &ice->state.shaders[stage]; - if (clean & IRIS_DIRTY_BINDINGS_CS) { + if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) { /* Re-pin any buffers referred to by the binding table. 
*/ iris_populate_binding_table(ice, batch, stage, true); } struct pipe_resource *sampler_res = shs->sampler_table.res; if (sampler_res) - iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false); + iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false, + IRIS_DOMAIN_NONE); - if ((clean & IRIS_DIRTY_SAMPLER_STATES_CS) && - (clean & IRIS_DIRTY_BINDINGS_CS) && - (clean & IRIS_DIRTY_CONSTANTS_CS) && - (clean & IRIS_DIRTY_CS)) { - iris_use_optional_res(batch, ice->state.last_res.cs_desc, false); + if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) && + (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) && + (stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) && + (stage_clean & IRIS_STAGE_DIRTY_CS)) { + iris_use_optional_res(batch, ice->state.last_res.cs_desc, false, + IRIS_DOMAIN_NONE); } - if (clean & IRIS_DIRTY_CS) { + if (stage_clean & IRIS_STAGE_DIRTY_CS) { struct iris_compiled_shader *shader = ice->shaders.prog[stage]; if (shader) { struct iris_bo *bo = iris_resource_bo(shader->assembly.res); - iris_use_pinned_bo(batch, bo, false); + iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE); struct iris_bo *curbe_bo = iris_resource_bo(ice->state.last_res.cs_thread_ids); - iris_use_pinned_bo(batch, curbe_bo, false); + iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE); struct brw_stage_prog_data *prog_data = shader->prog_data; if (prog_data->total_scratch > 0) { struct iris_bo *bo = iris_get_scratch_space(ice, prog_data->total_scratch, stage); - iris_use_pinned_bo(batch, bo, true); + iris_use_pinned_bo(batch, bo, true, IRIS_DOMAIN_NONE); } } } @@ -4963,8 +5200,20 @@ iris_update_surface_base_address(struct iris_batch *batch, uint32_t mocs = batch->screen->isl_dev.mocs.internal; + iris_batch_sync_region_start(batch); + flush_before_state_base_change(batch); +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline + * mode by putting the pipeline temporarily in 3D mode.. + */ + if (batch->name == IRIS_BATCH_COMPUTE) + emit_pipeline_select(batch, _3D); +#endif + iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) { sba.SurfaceStateBaseAddressModifyEnable = true; sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0); @@ -4983,7 +5232,17 @@ iris_update_surface_base_address(struct iris_batch *batch, #endif } +#if GEN_GEN == 12 + /* GEN:BUG:1607854226: + * + * Put the pipeline back into compute mode. + */ + if (batch->name == IRIS_BATCH_COMPUTE) + emit_pipeline_select(batch, GPGPU); +#endif + flush_after_state_base_change(batch); + iris_batch_sync_region_end(batch); batch->last_surface_base_address = binder->bo->gtt_offset; } @@ -5002,7 +5261,7 @@ iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, #if GEN_GEN >= 12 void -genX(emit_aux_map_state)(struct iris_batch *batch) +genX(invalidate_aux_map_state)(struct iris_batch *batch) { struct iris_screen *screen = batch->screen; void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr); @@ -5010,18 +5269,182 @@ genX(emit_aux_map_state)(struct iris_batch *batch) return; uint32_t aux_map_state_num = gen_aux_map_get_state_num(aux_map_ctx); if (batch->last_aux_map_state != aux_map_state_num) { + /* HSD 1209978178: docs say that before programming the aux table: + * + * "Driver must ensure that the engine is IDLE but ensure it doesn't + * add extra flushes in the case it knows that the engine is already + * IDLE." + * + * An end of pipe sync is needed here, otherwise we see GPU hangs in + * dEQP-GLES31.functional.copy_image.* tests. 
+ */ + iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table", + PIPE_CONTROL_CS_STALL); + /* If the aux-map state number increased, then we need to rewrite the * register. Rewriting the register is used to both set the aux-map * translation table address, and also to invalidate any previously * cached translations. */ - uint64_t base_addr = gen_aux_map_get_base(aux_map_ctx); - assert(base_addr != 0 && ALIGN(base_addr, 32 * 1024) == base_addr); - iris_load_register_imm64(batch, GENX(GFX_AUX_TABLE_BASE_ADDR_num), - base_addr); + iris_load_register_imm32(batch, GENX(GFX_CCS_AUX_INV_num), 1); batch->last_aux_map_state = aux_map_state_num; } } + +static void +init_aux_map_state(struct iris_batch *batch) +{ + struct iris_screen *screen = batch->screen; + void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr); + if (!aux_map_ctx) + return; + + uint64_t base_addr = gen_aux_map_get_base(aux_map_ctx); + assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr); + iris_load_register_imm64(batch, GENX(GFX_AUX_TABLE_BASE_ADDR_num), + base_addr); +} +#endif + +struct push_bos { + struct { + struct iris_address addr; + uint32_t length; + } buffers[4]; + int buffer_count; + uint32_t max_length; +}; + +static void +setup_constant_buffers(struct iris_context *ice, + struct iris_batch *batch, + int stage, + struct push_bos *push_bos) +{ + struct iris_shader_state *shs = &ice->state.shaders[stage]; + struct iris_compiled_shader *shader = ice->shaders.prog[stage]; + struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; + + uint32_t push_range_sum = 0; + + int n = 0; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + + if (range->length == 0) + continue; + + push_range_sum += range->length; + + if (range->length > push_bos->max_length) + push_bos->max_length = range->length; + + /* Range block is a binding table index, map back to UBO index. */ + unsigned block_index = iris_bti_to_group_index( + &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block); + assert(block_index != IRIS_SURFACE_NOT_USED); + + struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index]; + struct iris_resource *res = (void *) cbuf->buffer; + + assert(cbuf->buffer_offset % 32 == 0); + + push_bos->buffers[n].length = range->length; + push_bos->buffers[n].addr = + res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset) + : batch->screen->workaround_address; + n++; + } + + /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes: + * + * "The sum of all four read length fields must be less than or + * equal to the size of 64." 
+ */ + assert(push_range_sum <= 64); + + push_bos->buffer_count = n; +} + +static void +emit_push_constant_packets(struct iris_context *ice, + struct iris_batch *batch, + int stage, + const struct push_bos *push_bos) +{ + UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev; + struct iris_compiled_shader *shader = ice->shaders.prog[stage]; + struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; + + iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) { + pkt._3DCommandSubOpcode = push_constant_opcodes[stage]; +#if GEN_GEN >= 12 + pkt.MOCS = isl_dev->mocs.internal; +#endif + if (prog_data) { + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. + */ + int n = push_bos->buffer_count; + assert(n <= 4); + const unsigned shift = 4 - n; + for (int i = 0; i < n; i++) { + pkt.ConstantBody.ReadLength[i + shift] = + push_bos->buffers[i].length; + pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr; + } + } + } +} + +#if GEN_GEN >= 12 +static void +emit_push_constant_packet_all(struct iris_context *ice, + struct iris_batch *batch, + uint32_t shader_mask, + const struct push_bos *push_bos) +{ + struct isl_device *isl_dev = &batch->screen->isl_dev; + + if (!push_bos) { + iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) { + pc.ShaderUpdateEnable = shader_mask; + } + return; + } + + const uint32_t n = push_bos->buffer_count; + const uint32_t max_pointers = 4; + const uint32_t num_dwords = 2 + 2 * n; + uint32_t const_all[2 + 2 * max_pointers]; + uint32_t *dw = &const_all[0]; + + assert(n <= max_pointers); + iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) { + all.DWordLength = num_dwords - 2; + all.MOCS = isl_dev->mocs.internal; + all.ShaderUpdateEnable = shader_mask; + all.PointerBufferMask = (1 << n) - 1; + } + dw += 2; + + for (int i = 0; i < n; i++) { + _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), + dw + i * 2, data) { + data.PointerToConstantBuffer = push_bos->buffers[i].addr; + data.ConstantBufferReadLength = push_bos->buffers[i].length; + } + } + iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords); +} #endif static void @@ -5030,8 +5453,10 @@ iris_upload_dirty_render_state(struct iris_context *ice, const struct pipe_draw_info *draw) { const uint64_t dirty = ice->state.dirty; + const uint64_t stage_dirty = ice->state.stage_dirty; - if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER)) + if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) && + !(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER)) return; struct iris_genx_state *genx = ice->state.genx; @@ -5134,9 +5559,22 @@ iris_upload_dirty_render_state(struct iris_context *ice, assert(size[i] != 0); } - genX(emit_urb_setup)(ice, batch, size, - ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL, - ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL); + unsigned entries[4], start[4]; + gen_get_urb_config(&batch->screen->devinfo, + batch->screen->l3_config_3d, + ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL, + ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL, + size, entries, start, + &ice->state.urb_deref_block_size); + + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + iris_emit_cmd(batch, 
GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = start[i]; + urb.VSURBEntryAllocationSize = size[i] - 1; + urb.VSNumberofURBEntries = entries[i]; + } + } } if (dirty & IRIS_DIRTY_BLEND_STATE) { @@ -5202,8 +5640,24 @@ iris_upload_dirty_render_state(struct iris_context *ice, } } + /* GEN:BUG:1604061319 + * + * 3DSTATE_CONSTANT_* needs to be programmed before BTP_* + * + * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if + * any stage has a dirty binding table. + */ + const bool emit_const_wa = GEN_GEN >= 11 && + ((dirty & IRIS_DIRTY_RENDER_BUFFER) || + (stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS)); + +#if GEN_GEN >= 12 + uint32_t nobuffer_stages = 0; +#endif + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (!(dirty & (IRIS_DIRTY_CONSTANTS_VS << stage))) + if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) && + !emit_const_wa) continue; struct iris_shader_state *shs = &ice->state.shaders[stage]; @@ -5213,59 +5667,45 @@ iris_upload_dirty_render_state(struct iris_context *ice, continue; if (shs->sysvals_need_upload) - upload_sysvals(ice, stage); - - struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; - - iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) { - pkt._3DCommandSubOpcode = push_constant_opcodes[stage]; - if (prog_data) { - /* The Skylake PRM contains the following restriction: - * - * "The driver must ensure The following case does not occur - * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with - * buffer 3 read length equal to zero committed followed by a - * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to - * zero committed." - * - * To avoid this, we program the buffers in the highest slots. - * This way, slot 0 is only used if slot 3 is also used. - */ - int n = 3; + upload_sysvals(ice, stage, NULL); - for (int i = 3; i >= 0; i--) { - const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + struct push_bos push_bos = {}; + setup_constant_buffers(ice, batch, stage, &push_bos); - if (range->length == 0) - continue; - - /* Range block is a binding table index, map back to UBO index. */ - unsigned block_index = iris_bti_to_group_index( - &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block); - assert(block_index != IRIS_SURFACE_NOT_USED); - - struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index]; - struct iris_resource *res = (void *) cbuf->buffer; - - assert(cbuf->buffer_offset % 32 == 0); +#if GEN_GEN >= 12 + /* If this stage doesn't have any push constants, emit it later in a + * single CONSTANT_ALL packet with all the other stages. + */ + if (push_bos.buffer_count == 0) { + nobuffer_stages |= 1 << stage; + continue; + } - pkt.ConstantBody.ReadLength[n] = range->length; - pkt.ConstantBody.Buffer[n] = - res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset) - : ro_bo(batch->screen->workaround_bo, 0); - n--; - } - } + /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL + * contains only 5 bits, so we can only use it for buffers smaller than + * 32. + */ + if (push_bos.max_length < 32) { + emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos); + continue; } +#endif + emit_push_constant_packets(ice, batch, stage, &push_bos); } +#if GEN_GEN >= 12 + if (nobuffer_stages) + emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL); +#endif + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { /* Gen9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted * in order to commit constants. 
TODO: Investigate "Disable Gather * at Set Shader" to go back to legacy mode... */ - if (dirty & ((IRIS_DIRTY_BINDINGS_VS | - (GEN_GEN == 9 ? IRIS_DIRTY_CONSTANTS_VS : 0)) << stage)) { + if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS | + (GEN_GEN == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0)) + << stage)) { iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) { ptr._3DCommandSubOpcode = 38 + stage; ptr.PointertoVSBindingTable = binder->bt_offset[stage]; @@ -5292,13 +5732,13 @@ iris_upload_dirty_render_state(struct iris_context *ice, } for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (dirty & (IRIS_DIRTY_BINDINGS_VS << stage)) { + if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) { iris_populate_binding_table(ice, batch, stage, false); } } for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (!(dirty & (IRIS_DIRTY_SAMPLER_STATES_VS << stage)) || + if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) || !ice->shaders.prog[stage]) continue; @@ -5307,7 +5747,8 @@ iris_upload_dirty_render_state(struct iris_context *ice, struct iris_shader_state *shs = &ice->state.shaders[stage]; struct pipe_resource *res = shs->sampler_table.res; if (res) - iris_use_pinned_bo(batch, iris_resource_bo(res), false); + iris_use_pinned_bo(batch, iris_resource_bo(res), false, + IRIS_DOMAIN_NONE); iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) { ptr._3DCommandSubOpcode = 43 + stage; @@ -5316,7 +5757,8 @@ iris_upload_dirty_render_state(struct iris_context *ice, } if (ice->state.need_border_colors) - iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false); + iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false, + IRIS_DOMAIN_NONE); if (dirty & IRIS_DIRTY_MULTISAMPLE) { iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) { @@ -5334,7 +5776,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, } for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { - if (!(dirty & (IRIS_DIRTY_VS << stage))) + if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage))) continue; struct iris_compiled_shader *shader = ice->shaders.prog[stage]; @@ -5342,12 +5784,12 @@ iris_upload_dirty_render_state(struct iris_context *ice, if (shader) { struct brw_stage_prog_data *prog_data = shader->prog_data; struct iris_resource *cache = (void *) shader->assembly.res; - iris_use_pinned_bo(batch, cache->bo, false); + iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE); if (prog_data->total_scratch > 0) { struct iris_bo *bo = iris_get_scratch_space(ice, prog_data->total_scratch, stage); - iris_use_pinned_bo(batch, bo, true); + iris_use_pinned_bo(batch, bo, true, IRIS_DOMAIN_NONE); } if (stage == MESA_SHADER_FRAGMENT) { @@ -5438,9 +5880,9 @@ iris_upload_dirty_render_state(struct iris_context *ice, if (tgt) { tgt->zeroed = true; iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer), - true); + true, IRIS_DOMAIN_OTHER_WRITE); iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res), - true); + true, IRIS_DOMAIN_OTHER_WRITE); } } } @@ -5509,13 +5951,17 @@ iris_upload_dirty_render_state(struct iris_context *ice, ARRAY_SIZE(cso_rast->clip)); } - if (dirty & IRIS_DIRTY_RASTER) { + if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) { struct iris_rasterizer_state *cso = ice->state.cso_rast; iris_batch_emit(batch, cso->raster, sizeof(cso->raster)); uint32_t dynamic_sf[GENX(3DSTATE_SF_length)]; iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) { sf.ViewportTransformEnable = !ice->state.window_space_position; + +#if GEN_GEN >= 12 + 
sf.DerefBlockSize = ice->state.urb_deref_block_size; +#endif } iris_emit_merge(batch, cso->sf, dynamic_sf, ARRAY_SIZE(dynamic_sf)); @@ -5573,7 +6019,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) { struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa; -#if GEN_GEN >= 9 +#if GEN_GEN >= 9 && GEN_GEN < 12 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref; uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)]; iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) { @@ -5582,6 +6028,9 @@ iris_upload_dirty_render_state(struct iris_context *ice, } iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds)); #else + /* Use modify disable fields which allow us to emit packets + * directly instead of merging them later. + */ iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds)); #endif @@ -5590,6 +6039,25 @@ iris_upload_dirty_render_state(struct iris_context *ice, #endif } + if (dirty & IRIS_DIRTY_STENCIL_REF) { +#if GEN_GEN >= 12 + /* Use modify disable fields which allow us to emit packets + * directly instead of merging them later. + */ + struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref; + uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)]; + iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) { + wmds.StencilReferenceValue = p_stencil_refs->ref_value[0]; + wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1]; + wmds.StencilTestMaskModifyDisable = true; + wmds.StencilWriteMaskModifyDisable = true; + wmds.StencilStateModifyDisable = true; + wmds.DepthStateModifyDisable = true; + } + iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs)); +#endif + } + if (dirty & IRIS_DIRTY_SCISSOR_RECT) { uint32_t scissor_offset = emit_state(batch, ice->state.dynamic_uploader, @@ -5610,8 +6078,36 @@ iris_upload_dirty_render_state(struct iris_context *ice, * first. */ uint32_t clear_length = GENX(3DSTATE_CLEAR_PARAMS_length) * 4; - uint32_t cso_z_size = sizeof(cso_z->packets) - clear_length; + uint32_t cso_z_size = batch->screen->isl_dev.ds.size - clear_length;; + +#if GEN_GEN == 12 + /* GEN:BUG:14010455700 + * + * ISL will change some CHICKEN registers depending on the depth surface + * format, along with emitting the depth and stencil packets. In that + * case, we want to do a depth flush and stall, so the pipeline is not + * using these settings while we change the registers. + */ + iris_emit_end_of_pipe_sync(batch, + "Workaround: Stop pipeline for 14010455700", + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH); +#endif + iris_batch_emit(batch, cso_z->packets, cso_z_size); + if (GEN_GEN >= 12) { + /* GEN:BUG:1408224581 + * + * Workaround: Gen12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). 
+ */ + iris_emit_pipe_control_write(batch, "WA for stencil state", + PIPE_CONTROL_WRITE_IMMEDIATE, + batch->screen->workaround_address.bo, + batch->screen->workaround_address.offset, 0); + } union isl_color_value clear_value = { .f32 = { 0, } }; @@ -5660,7 +6156,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) { int count = util_bitcount64(ice->state.bound_vertex_buffers); - int dynamic_bound = ice->state.bound_vertex_buffers; + uint64_t dynamic_bound = ice->state.bound_vertex_buffers; if (ice->state.vs_uses_draw_params) { assert(ice->draw.draw_params.res); @@ -5678,7 +6174,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, vb.BufferStartingAddress = ro_bo(NULL, res->bo->gtt_offset + (int) ice->draw.draw_params.offset); - vb.MOCS = mocs(res->bo, &batch->screen->isl_dev); + vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev); } dynamic_bound |= 1ull << count; count++; @@ -5700,13 +6196,22 @@ iris_upload_dirty_render_state(struct iris_context *ice, vb.BufferStartingAddress = ro_bo(NULL, res->bo->gtt_offset + (int) ice->draw.derived_draw_params.offset); - vb.MOCS = mocs(res->bo, &batch->screen->isl_dev); + vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev); } dynamic_bound |= 1ull << count; count++; } if (count) { +#if GEN_GEN >= 11 + /* Gen11+ doesn't need the cache workaround below */ + uint64_t bound = dynamic_bound; + while (bound) { + const int i = u_bit_scan64(&bound); + iris_use_optional_res(batch, genx->vertex_buffers[i].resource, + false, IRIS_DOMAIN_OTHER_READ); + } +#else /* The VF cache designers cut corners, and made the cache key's * tuple only consider the bottom * 32 bits of the address. If you have two vertex buffers which get @@ -5726,7 +6231,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, struct iris_resource *res = (void *) genx->vertex_buffers[i].resource; if (res) { - iris_use_pinned_bo(batch, res->bo, false); + iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ); high_bits = res->bo->gtt_offset >> 32ull; if (high_bits != ice->state.last_vbo_high_bits[i]) { @@ -5742,6 +6247,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, "workaround: VF cache 32-bit key [VB]", flush_flags); } +#endif const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length); @@ -5899,7 +6405,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1); #if GEN_GEN >= 12 - genX(emit_aux_map_state)(batch); + genX(invalidate_aux_map_state)(batch); #endif } @@ -5910,16 +6416,19 @@ iris_upload_render_state(struct iris_context *ice, { bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT; + iris_batch_sync_region_start(batch); + /* Always pin the binder. If we're emitting new binding table pointers, * we need it. If not, we're probably inheriting old tables via the * context, and need it anyway. Since true zero-bindings cases are * practically non-existent, just pin it and avoid last_res tracking. 
*/ - iris_use_pinned_bo(batch, ice->state.binder.bo, false); + iris_use_pinned_bo(batch, ice->state.binder.bo, false, + IRIS_DOMAIN_NONE); - if (!batch->contains_draw) { + if (!batch->contains_draw_with_next_seqno) { iris_restore_render_saved_bos(ice, batch, draw); - batch->contains_draw = true; + batch->contains_draw_with_next_seqno = batch->contains_draw = true; } iris_upload_dirty_render_state(ice, batch, draw); @@ -5946,7 +6455,7 @@ iris_upload_render_state(struct iris_context *ice, uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)]; iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) { ib.IndexFormat = draw->index_size >> 1; - ib.MOCS = mocs(bo, &batch->screen->isl_dev); + ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev); ib.BufferSize = bo->size - offset; ib.BufferStartingAddress = ro_bo(NULL, bo->gtt_offset + offset); } @@ -5954,9 +6463,10 @@ iris_upload_render_state(struct iris_context *ice, if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) { memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet)); iris_batch_emit(batch, ib_packet, sizeof(ib_packet)); - iris_use_pinned_bo(batch, bo, false); + iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_OTHER_READ); } +#if GEN_GEN < 11 /* The VF cache key only uses 32-bits, see vertex buffer comment above */ uint16_t high_bits = bo->gtt_offset >> 32ull; if (high_bits != ice->state.last_index_bo_high_bits) { @@ -5966,6 +6476,7 @@ iris_upload_render_state(struct iris_context *ice, PIPE_CONTROL_CS_STALL); ice->state.last_index_bo_high_bits = high_bits; } +#endif } #define _3DPRIM_END_OFFSET 0x2420 @@ -6114,14 +6625,43 @@ iris_upload_render_state(struct iris_context *ice, } } } + + iris_batch_sync_region_end(batch); } static void -iris_upload_compute_state(struct iris_context *ice, - struct iris_batch *batch, - const struct pipe_grid_info *grid) +iris_load_indirect_location(struct iris_context *ice, + struct iris_batch *batch, + const struct pipe_grid_info *grid) { - const uint64_t dirty = ice->state.dirty; +#define GPGPU_DISPATCHDIMX 0x2500 +#define GPGPU_DISPATCHDIMY 0x2504 +#define GPGPU_DISPATCHDIMZ 0x2508 + + assert(grid->indirect); + + struct iris_state_ref *grid_size = &ice->state.grid_size; + struct iris_bo *bo = iris_resource_bo(grid_size->res); + iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = GPGPU_DISPATCHDIMX; + lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0); + } + iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = GPGPU_DISPATCHDIMY; + lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4); + } + iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = GPGPU_DISPATCHDIMZ; + lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8); + } +} + +static void +iris_upload_gpgpu_walker(struct iris_context *ice, + struct iris_batch *batch, + const struct pipe_grid_info *grid) +{ + const uint64_t stage_dirty = ice->state.stage_dirty; struct iris_screen *screen = batch->screen; const struct gen_device_info *devinfo = &screen->devinfo; struct iris_binder *binder = &ice->state.binder; @@ -6130,34 +6670,13 @@ iris_upload_compute_state(struct iris_context *ice, ice->shaders.prog[MESA_SHADER_COMPUTE]; struct brw_stage_prog_data *prog_data = shader->prog_data; struct brw_cs_prog_data *cs_prog_data = (void *) prog_data; + const uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2]; + const unsigned simd_size = + brw_cs_simd_size_for_group_size(devinfo, cs_prog_data, group_size); + const unsigned threads = 
DIV_ROUND_UP(group_size, simd_size); - /* Always pin the binder. If we're emitting new binding table pointers, - * we need it. If not, we're probably inheriting old tables via the - * context, and need it anyway. Since true zero-bindings cases are - * practically non-existent, just pin it and avoid last_res tracking. - */ - iris_use_pinned_bo(batch, ice->state.binder.bo, false); - - if ((dirty & IRIS_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload) - upload_sysvals(ice, MESA_SHADER_COMPUTE); - - if (dirty & IRIS_DIRTY_BINDINGS_CS) - iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false); - if (dirty & IRIS_DIRTY_SAMPLER_STATES_CS) - iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE); - - iris_use_optional_res(batch, shs->sampler_table.res, false); - iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false); - - if (ice->state.need_border_colors) - iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false); - -#if GEN_GEN >= 12 - genX(emit_aux_map_state)(batch); -#endif - - if (dirty & IRIS_DIRTY_CS) { + if (stage_dirty & IRIS_STAGE_DIRTY_CS) { /* The MEDIA_VFE_STATE documentation for Gen8+ says: * * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless @@ -6176,7 +6695,7 @@ iris_upload_compute_state(struct iris_context *ice, iris_get_scratch_space(ice, prog_data->total_scratch, MESA_SHADER_COMPUTE); vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; - vfe.ScratchSpaceBasePointer = rw_bo(bo, 0); + vfe.ScratchSpaceBasePointer = rw_bo(bo, 0, IRIS_DOMAIN_NONE); } vfe.MaximumNumberofThreads = @@ -6192,42 +6711,55 @@ iris_upload_compute_state(struct iris_context *ice, vfe.URBEntryAllocationSize = 2; vfe.CURBEAllocationSize = - ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + + ALIGN(cs_prog_data->push.per_thread.regs * threads + cs_prog_data->push.cross_thread.regs, 2); } } /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */ - if (dirty & IRIS_DIRTY_CS) { + if (stage_dirty & IRIS_STAGE_DIRTY_CS) { uint32_t curbe_data_offset = 0; assert(cs_prog_data->push.cross_thread.dwords == 0 && cs_prog_data->push.per_thread.dwords == 1 && cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID); + const unsigned push_const_size = + brw_cs_push_const_total_size(cs_prog_data, threads); uint32_t *curbe_data_map = stream_state(batch, ice->state.dynamic_uploader, &ice->state.last_res.cs_thread_ids, - ALIGN(cs_prog_data->push.total.size, 64), 64, + ALIGN(push_const_size, 64), 64, &curbe_data_offset); assert(curbe_data_map); - memset(curbe_data_map, 0x5a, ALIGN(cs_prog_data->push.total.size, 64)); - iris_fill_cs_push_const_buffer(cs_prog_data, curbe_data_map); + memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64)); + iris_fill_cs_push_const_buffer(cs_prog_data, threads, curbe_data_map); iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) { - curbe.CURBETotalDataLength = - ALIGN(cs_prog_data->push.total.size, 64); + curbe.CURBETotalDataLength = ALIGN(push_const_size, 64); curbe.CURBEDataStartAddress = curbe_data_offset; } } - if (dirty & (IRIS_DIRTY_SAMPLER_STATES_CS | - IRIS_DIRTY_BINDINGS_CS | - IRIS_DIRTY_CONSTANTS_CS | - IRIS_DIRTY_CS)) { + for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) { + struct pipe_resource *res = ice->state.global_bindings[i]; + if (!res) + continue; + + iris_use_pinned_bo(batch, iris_resource_bo(res), + true, IRIS_DOMAIN_NONE); + } + + if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS | + IRIS_STAGE_DIRTY_BINDINGS_CS | + IRIS_STAGE_DIRTY_CONSTANTS_CS | + 
IRIS_STAGE_DIRTY_CS)) { uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) { + idd.KernelStartPointer = + KSP(shader) + brw_cs_prog_data_prog_offset(cs_prog_data, simd_size); idd.SamplerStatePointer = shs->sampler_table.offset; idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; + idd.NumberofThreadsinGPGPUThreadGroup = threads; } for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++) @@ -6242,42 +6774,17 @@ iris_upload_compute_state(struct iris_context *ice, } } - uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2]; - uint32_t remainder = group_size & (cs_prog_data->simd_size - 1); - uint32_t right_mask; + if (grid->indirect) + iris_load_indirect_location(ice, batch, grid); - if (remainder > 0) - right_mask = ~0u >> (32 - remainder); - else - right_mask = ~0u >> (32 - cs_prog_data->simd_size); - -#define GPGPU_DISPATCHDIMX 0x2500 -#define GPGPU_DISPATCHDIMY 0x2504 -#define GPGPU_DISPATCHDIMZ 0x2508 - - if (grid->indirect) { - struct iris_state_ref *grid_size = &ice->state.grid_size; - struct iris_bo *bo = iris_resource_bo(grid_size->res); - iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = GPGPU_DISPATCHDIMX; - lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0); - } - iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = GPGPU_DISPATCHDIMY; - lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4); - } - iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { - lrm.RegisterAddress = GPGPU_DISPATCHDIMZ; - lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8); - } - } + const uint32_t right_mask = brw_cs_right_mask(group_size, simd_size); iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) { ggw.IndirectParameterEnable = grid->indirect != NULL; - ggw.SIMDSize = cs_prog_data->simd_size / 16; + ggw.SIMDSize = simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = cs_prog_data->threads - 1; + ggw.ThreadWidthCounterMaximum = threads - 1; ggw.ThreadGroupIDXDimension = grid->grid[0]; ggw.ThreadGroupIDYDimension = grid->grid[1]; ggw.ThreadGroupIDZDimension = grid->grid[2]; @@ -6286,11 +6793,58 @@ iris_upload_compute_state(struct iris_context *ice, } iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf); +} - if (!batch->contains_draw) { +static void +iris_upload_compute_state(struct iris_context *ice, + struct iris_batch *batch, + const struct pipe_grid_info *grid) +{ + const uint64_t stage_dirty = ice->state.stage_dirty; + struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE]; + struct iris_compiled_shader *shader = + ice->shaders.prog[MESA_SHADER_COMPUTE]; + + iris_batch_sync_region_start(batch); + + /* Always pin the binder. If we're emitting new binding table pointers, + * we need it. If not, we're probably inheriting old tables via the + * context, and need it anyway. Since true zero-bindings cases are + * practically non-existent, just pin it and avoid last_res tracking. 
+ */ + iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE); + + if ((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) && + shs->sysvals_need_upload) + upload_sysvals(ice, MESA_SHADER_COMPUTE, grid); + + if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS) + iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false); + + if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) + iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE); + + iris_use_optional_res(batch, shs->sampler_table.res, false, + IRIS_DOMAIN_NONE); + iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false, + IRIS_DOMAIN_NONE); + + if (ice->state.need_border_colors) + iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false, + IRIS_DOMAIN_NONE); + +#if GEN_GEN >= 12 + genX(invalidate_aux_map_state)(batch); +#endif + + iris_upload_gpgpu_walker(ice, batch, grid); + + if (!batch->contains_draw_with_next_seqno) { iris_restore_compute_saved_bos(ice, batch, grid); - batch->contains_draw = true; + batch->contains_draw_with_next_seqno = batch->contains_draw = true; } + + iris_batch_sync_region_end(batch); } /** @@ -6329,7 +6883,8 @@ iris_destroy_state(struct iris_context *ice) } for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) { pipe_resource_reference(&shs->image[i].base.resource, NULL); - pipe_resource_reference(&shs->image[i].surface_state.res, NULL); + pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL); + free(shs->image[i].surface_state.cpu); } for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) { pipe_resource_reference(&shs->ssbo[i].buffer, NULL); @@ -6361,11 +6916,9 @@ iris_destroy_state(struct iris_context *ice) static void iris_rebind_buffer(struct iris_context *ice, - struct iris_resource *res, - uint64_t old_address) + struct iris_resource *res) { struct pipe_context *ctx = &ice->ctx; - struct iris_screen *screen = (void *) ctx->screen; struct iris_genx_state *genx = ice->state.genx; assert(res->base.target == PIPE_BUFFER); @@ -6391,9 +6944,10 @@ iris_rebind_buffer(struct iris_context *ice, STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32); STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64); uint64_t *addr = (uint64_t *) &state->state[1]; + struct iris_bo *bo = iris_resource_bo(state->resource); - if (*addr == old_address + state->offset) { - *addr = res->bo->gtt_offset + state->offset; + if (*addr != bo->gtt_offset + state->offset) { + *addr = bo->gtt_offset + state->offset; ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS; } } @@ -6429,7 +6983,7 @@ iris_rebind_buffer(struct iris_context *ice, if (res->bo == iris_resource_bo(cbuf->buffer)) { pipe_resource_reference(&surf_state->res, NULL); - ice->state.dirty |= IRIS_DIRTY_CONSTANTS_VS << s; + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s; } } } @@ -6457,17 +7011,11 @@ iris_rebind_buffer(struct iris_context *ice, while (bound_sampler_views) { const int i = u_bit_scan(&bound_sampler_views); struct iris_sampler_view *isv = shs->textures[i]; + struct iris_bo *bo = isv->res->bo; - if (res->bo == iris_resource_bo(isv->base.texture)) { - void *map = alloc_surface_states(ice->state.surface_uploader, - &isv->surface_state, - isv->res->aux.sampler_usages); - assert(map); - fill_buffer_surface_state(&screen->isl_dev, isv->res, map, - isv->view.format, isv->view.swizzle, - isv->base.u.buf.offset, - isv->base.u.buf.size); - ice->state.dirty |= IRIS_DIRTY_BINDINGS_VS << s; + if (update_surface_state_addrs(ice->state.surface_uploader, + &isv->surface_state, bo)) { + 
ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s; } } } @@ -6477,9 +7025,11 @@ iris_rebind_buffer(struct iris_context *ice, while (bound_image_views) { const int i = u_bit_scan(&bound_image_views); struct iris_image_view *iv = &shs->image[i]; + struct iris_bo *bo = iris_resource_bo(iv->base.resource); - if (res->bo == iris_resource_bo(iv->base.resource)) { - iris_set_shader_images(ctx, p_stage, i, 1, &iv->base); + if (update_surface_state_addrs(ice->state.surface_uploader, + &iv->surface_state, bo)) { + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s; } } } @@ -6488,6 +7038,45 @@ iris_rebind_buffer(struct iris_context *ice, /* ------------------------------------------------------------------- */ +/** + * Introduce a batch synchronization boundary, and update its cache coherency + * status to reflect the execution of a PIPE_CONTROL command with the + * specified flags. + */ +static void +batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags) +{ + iris_batch_sync_boundary(batch); + + if ((flags & PIPE_CONTROL_CS_STALL)) { + if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) + iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE); + + if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) + iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE); + + if ((flags & PIPE_CONTROL_FLUSH_ENABLE)) + iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE); + + if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS | + PIPE_CONTROL_STALL_AT_SCOREBOARD))) + iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ); + } + + if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) + iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE); + + if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) + iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE); + + if ((flags & PIPE_CONTROL_FLUSH_ENABLE)) + iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE); + + if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) && + (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE)) + iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_READ); +} + static unsigned flags_to_post_sync_op(uint32_t flags) { @@ -6572,7 +7161,20 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, 0, NULL, 0, 0); } - if (GEN_GEN == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) { + /* GEN:BUG:1409226450, Wait for EU to be idle before pipe control which + * invalidates the instruction cache + */ + if (GEN_GEN == 12 && (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE)) { + iris_emit_raw_pipe_control(batch, + "workaround: CS stall before instruction " + "cache invalidate", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD, bo, offset, + imm); + } + + if ((GEN_GEN == 9 || (GEN_GEN == 12 && devinfo->revision == 0 /* A0*/)) && + IS_COMPUTE_PIPELINE(batch) && post_sync_flags) { /* Project: SKL / Argument: LRI Post Sync Operation [23] * * "PIPECONTROL command with “Command Streamer Stall Enable” must be @@ -6581,23 +7183,14 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, * PIPELINE_SELECT command is set to GPGPU mode of operation)." * * The same text exists a few rows below for Post Sync Op. + * + * On Gen12 this is GEN:BUG:1607156449. 
*/ iris_emit_raw_pipe_control(batch, "workaround: CS stall before gpgpu post-sync", PIPE_CONTROL_CS_STALL, bo, offset, imm); } - if (GEN_GEN == 10 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) { - /* Cannonlake: - * "Before sending a PIPE_CONTROL command with bit 12 set, SW must issue - * another PIPE_CONTROL with Render Target Cache Flush Enable (bit 12) - * = 0 and Pipe Control Flush Enable (bit 7) = 1" - */ - iris_emit_raw_pipe_control(batch, - "workaround: PC flush before RT flush", - PIPE_CONTROL_FLUSH_ENABLE, bo, offset, imm); - } - /* "Flush Types" workarounds --------------------------------------------- * We do these now because they may add post-sync operations or CS stalls. */ @@ -6612,26 +7205,8 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, flags |= PIPE_CONTROL_WRITE_IMMEDIATE; post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE; non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE; - bo = batch->screen->workaround_bo; - } - } - - /* #1130 from Gen10 workarounds page: - * - * "Enable Depth Stall on every Post Sync Op if Render target Cache - * Flush is not enabled in same PIPE CONTROL and Enable Pixel score - * board stall if Render target cache flush is enabled." - * - * Applicable to CNL B0 and C0 steppings only. - * - * The wording here is unclear, and this workaround doesn't look anything - * like the internal bug report recommendations, but leave it be for now... - */ - if (GEN_GEN == 10) { - if (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) { - flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; - } else if (flags & non_lri_post_sync_flags) { - flags |= PIPE_CONTROL_DEPTH_STALL; + bo = batch->screen->workaround_address.bo; + offset = batch->screen->workaround_address.offset; } } @@ -6890,11 +7465,20 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; } + if (GEN_GEN >= 12 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) { + /* GEN:BUG:1409600907: + * + * "PIPE_CONTROL with Depth Stall Enable bit must be set + * with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + flags |= PIPE_CONTROL_DEPTH_STALL; + } + /* Emit --------------------------------------------------------------- */ if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) { fprintf(stderr, - " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n", + " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n", (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "", (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "", (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "", @@ -6917,12 +7501,19 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "", (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "", (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "", + (flags & PIPE_CONTROL_FLUSH_HDC) ? 
"HDC " : "", imm, reason); } + batch_mark_sync_for_pipe_control(batch, flags); + iris_batch_sync_region_start(batch); + iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) { #if GEN_GEN >= 12 pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH; +#endif +#if GEN_GEN >= 11 + pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC; #endif pc.LRIPostSyncOperation = NoLRIOperation; pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE; @@ -6951,37 +7542,11 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE; pc.TextureCacheInvalidationEnable = flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; - pc.Address = rw_bo(bo, offset); + pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE); pc.ImmediateData = imm; } -} - -void -genX(emit_urb_setup)(struct iris_context *ice, - struct iris_batch *batch, - const unsigned size[4], - bool tess_present, bool gs_present) -{ - const struct gen_device_info *devinfo = &batch->screen->devinfo; - const unsigned push_size_kB = 32; - unsigned entries[4]; - unsigned start[4]; - ice->shaders.last_vs_entry_size = size[MESA_SHADER_VERTEX]; - - gen_get_urb_config(devinfo, 1024 * push_size_kB, - 1024 * ice->shaders.urb_size, - tess_present, gs_present, - size, entries, start); - - for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) { - urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; - } - } + iris_batch_sync_region_end(batch); } #if GEN_GEN == 9 @@ -7066,10 +7631,13 @@ iris_emit_mi_report_perf_count(struct iris_batch *batch, uint32_t offset_in_bytes, uint32_t report_id) { + iris_batch_sync_region_start(batch); iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) { - mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes); + mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes, + IRIS_DOMAIN_OTHER_WRITE); mi_rpc.ReportID = report_id; } + iris_batch_sync_region_end(batch); } /** @@ -7165,6 +7733,22 @@ genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch, #endif } +static void +iris_set_frontend_noop(struct pipe_context *ctx, bool enable) +{ + struct iris_context *ice = (struct iris_context *) ctx; + + if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) { + ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER; + ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER; + } + + if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) { + ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE; + ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE; + } +} + void genX(init_state)(struct iris_context *ice) { @@ -7194,6 +7778,8 @@ genX(init_state)(struct iris_context *ice) ctx->set_shader_buffers = iris_set_shader_buffers; ctx->set_shader_images = iris_set_shader_images; ctx->set_sampler_views = iris_set_sampler_views; + ctx->set_compute_resources = iris_set_compute_resources; + ctx->set_global_binding = iris_set_global_binding; ctx->set_tess_state = iris_set_tess_state; ctx->set_framebuffer_state = iris_set_framebuffer_state; ctx->set_polygon_stipple = iris_set_polygon_stipple; @@ -7209,40 +7795,41 @@ genX(init_state)(struct iris_context *ice) ctx->create_stream_output_target = iris_create_stream_output_target; ctx->stream_output_target_destroy = iris_stream_output_target_destroy; ctx->set_stream_output_targets = iris_set_stream_output_targets; - - ice->vtbl.destroy_state = iris_destroy_state; - 
ice->vtbl.init_render_context = iris_init_render_context; - ice->vtbl.init_compute_context = iris_init_compute_context; - ice->vtbl.upload_render_state = iris_upload_render_state; - ice->vtbl.update_surface_base_address = iris_update_surface_base_address; - ice->vtbl.upload_compute_state = iris_upload_compute_state; - ice->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control; - ice->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count; - ice->vtbl.rebind_buffer = iris_rebind_buffer; - ice->vtbl.load_register_reg32 = iris_load_register_reg32; - ice->vtbl.load_register_reg64 = iris_load_register_reg64; - ice->vtbl.load_register_imm32 = iris_load_register_imm32; - ice->vtbl.load_register_imm64 = iris_load_register_imm64; - ice->vtbl.load_register_mem32 = iris_load_register_mem32; - ice->vtbl.load_register_mem64 = iris_load_register_mem64; - ice->vtbl.store_register_mem32 = iris_store_register_mem32; - ice->vtbl.store_register_mem64 = iris_store_register_mem64; - ice->vtbl.store_data_imm32 = iris_store_data_imm32; - ice->vtbl.store_data_imm64 = iris_store_data_imm64; - ice->vtbl.copy_mem_mem = iris_copy_mem_mem; - ice->vtbl.derived_program_state_size = iris_derived_program_state_size; - ice->vtbl.store_derived_program_state = iris_store_derived_program_state; - ice->vtbl.create_so_decl_list = iris_create_so_decl_list; - ice->vtbl.populate_vs_key = iris_populate_vs_key; - ice->vtbl.populate_tcs_key = iris_populate_tcs_key; - ice->vtbl.populate_tes_key = iris_populate_tes_key; - ice->vtbl.populate_gs_key = iris_populate_gs_key; - ice->vtbl.populate_fs_key = iris_populate_fs_key; - ice->vtbl.populate_cs_key = iris_populate_cs_key; - ice->vtbl.mocs = mocs; - ice->vtbl.lost_genx_state = iris_lost_genx_state; + ctx->set_frontend_noop = iris_set_frontend_noop; + + screen->vtbl.destroy_state = iris_destroy_state; + screen->vtbl.init_render_context = iris_init_render_context; + screen->vtbl.init_compute_context = iris_init_compute_context; + screen->vtbl.upload_render_state = iris_upload_render_state; + screen->vtbl.update_surface_base_address = iris_update_surface_base_address; + screen->vtbl.upload_compute_state = iris_upload_compute_state; + screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control; + screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count; + screen->vtbl.rebind_buffer = iris_rebind_buffer; + screen->vtbl.load_register_reg32 = iris_load_register_reg32; + screen->vtbl.load_register_reg64 = iris_load_register_reg64; + screen->vtbl.load_register_imm32 = iris_load_register_imm32; + screen->vtbl.load_register_imm64 = iris_load_register_imm64; + screen->vtbl.load_register_mem32 = iris_load_register_mem32; + screen->vtbl.load_register_mem64 = iris_load_register_mem64; + screen->vtbl.store_register_mem32 = iris_store_register_mem32; + screen->vtbl.store_register_mem64 = iris_store_register_mem64; + screen->vtbl.store_data_imm32 = iris_store_data_imm32; + screen->vtbl.store_data_imm64 = iris_store_data_imm64; + screen->vtbl.copy_mem_mem = iris_copy_mem_mem; + screen->vtbl.derived_program_state_size = iris_derived_program_state_size; + screen->vtbl.store_derived_program_state = iris_store_derived_program_state; + screen->vtbl.create_so_decl_list = iris_create_so_decl_list; + screen->vtbl.populate_vs_key = iris_populate_vs_key; + screen->vtbl.populate_tcs_key = iris_populate_tcs_key; + screen->vtbl.populate_tes_key = iris_populate_tes_key; + screen->vtbl.populate_gs_key = iris_populate_gs_key; + screen->vtbl.populate_fs_key = iris_populate_fs_key; + 
screen->vtbl.populate_cs_key = iris_populate_cs_key; + screen->vtbl.lost_genx_state = iris_lost_genx_state; ice->state.dirty = ~0ull; + ice->state.stage_dirty = ~0ull; ice->state.statistics_counters_enabled = true;
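
Editor's note (not part of the commit): the GPGPU_WALKER hunk above replaces the
open-coded execution-mask computation with a call to brw_cs_right_mask(group_size,
simd_size). The sketch below is a hypothetical stand-in, derived only from the
open-coded version that this diff removes; the name sketch_cs_right_mask and its
exact placement are illustrative assumptions, not the actual brw helper.

    /* Minimal sketch, assuming the helper keeps the semantics of the removed
     * open-coded version: build the execution mask for the last, possibly
     * partial, SIMD thread of a compute thread group.
     */
    static inline uint32_t
    sketch_cs_right_mask(uint32_t group_size, uint32_t simd_size)
    {
       /* Number of invocations left over in the final thread; simd_size is a
        * power of two, so the AND gives group_size % simd_size.
        */
       const uint32_t remainder = group_size & (simd_size - 1);

       /* A full final thread enables all simd_size channels, a partial one
        * enables only the low 'remainder' channels.
        */
       return ~0u >> (32 - (remainder ? remainder : simd_size));
    }

This pairs with the thread count used elsewhere in the hunk, i.e.
threads = DIV_ROUND_UP(group_size, simd_size): all threads but the last run with a
full mask, and the value above masks off the unused channels of the last one.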