From f8796079dab4a3d6fb9ea034b93fd38de53fbe1f Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 19 Oct 2018 01:29:05 -0700 Subject: [PATCH] iris: better dirty checking --- src/gallium/drivers/iris/iris_program_cache.c | 4 + src/gallium/drivers/iris/iris_state.c | 115 ++++++++++-------- 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c index 0006e8e8bea..9fad14a2847 100644 --- a/src/gallium/drivers/iris/iris_program_cache.c +++ b/src/gallium/drivers/iris/iris_program_cache.c @@ -115,6 +115,10 @@ dirty_flag_for_cache(enum iris_program_cache_id cache_id) if (cache_id == IRIS_CACHE_VS) return IRIS_DIRTY_VS | IRIS_DIRTY_VF_SGVS; + /* For compute, prog_data->threads needs to be uploaded as constants. */ + if (cache_id == IRIS_CACHE_CS) + return IRIS_DIRTY_CS | IRIS_DIRTY_CONSTANTS_CS; + return IRIS_DIRTY_VS << cache_id | IRIS_DIRTY_BINDINGS_VS << cache_id; } diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index c11e4e3e87b..b30d4b7b27f 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -4284,43 +4284,46 @@ iris_upload_compute_state(struct iris_context *ice, if (ice->state.need_border_colors) iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false); - /* The MEDIA_VFE_STATE documentation for Gen8+ says: - * - * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless - * the only bits that are changed are scoreboard related: Scoreboard - * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For - * these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient." - */ - iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL); + if (dirty & IRIS_DIRTY_CS) { + /* The MEDIA_VFE_STATE documentation for Gen8+ says: + * + * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless + * the only bits that are changed are scoreboard related: Scoreboard + * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For + * these scoreboard related states, a MEDIA_STATE_FLUSH is + * sufficient." + */ + iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL); - iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) { - if (prog_data->total_scratch) { - /* Per Thread Scratch Space is in the range [0, 11] where - * 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. - */ - // XXX: vfe.ScratchSpaceBasePointer - //vfe.PerThreadScratchSpace = - //ffs(stage_state->per_thread_scratch) - 11; - } + iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) { + if (prog_data->total_scratch) { + /* Per Thread Scratch Space is in the range [0, 11] where + * 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. + */ + // XXX: vfe.ScratchSpaceBasePointer + //vfe.PerThreadScratchSpace = + //ffs(stage_state->per_thread_scratch) - 11; + } - vfe.MaximumNumberofThreads = - devinfo->max_cs_threads * screen->subslice_total - 1; + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * screen->subslice_total - 1; #if GEN_GEN < 11 - vfe.ResetGatewayTimer = - Resettingrelativetimerandlatchingtheglobaltimestamp; + vfe.ResetGatewayTimer = + Resettingrelativetimerandlatchingtheglobaltimestamp; #endif - vfe.NumberofURBEntries = 2; - vfe.URBEntryAllocationSize = 2; + vfe.NumberofURBEntries = 2; + vfe.URBEntryAllocationSize = 2; - // XXX: Use Indirect Payload Storage? - vfe.CURBEAllocationSize = - ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + - cs_prog_data->push.cross_thread.regs, 2); + // XXX: Use Indirect Payload Storage? + vfe.CURBEAllocationSize = + ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + + cs_prog_data->push.cross_thread.regs, 2); + } } - // XXX: hack iris_set_constant_buffers to upload compute shader constants - // XXX: differently...? + // XXX: hack iris_set_constant_buffers to upload these thread counts + // XXX: along with regular uniforms for compute shaders, somehow. uint32_t curbe_data_offset = 0; // TODO: Move subgroup-id into uniforms ubo so we can push uniforms @@ -4335,35 +4338,43 @@ iris_upload_compute_state(struct iris_context *ice, assert(curbe_data_map); memset(curbe_data_map, 0x5a, ALIGN(cs_prog_data->push.total.size, 64)); iris_fill_cs_push_const_buffer(cs_prog_data, curbe_data_map); - iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) { - curbe.CURBETotalDataLength = - ALIGN(cs_prog_data->push.total.size, 64); - curbe.CURBEDataStartAddress = curbe_data_offset; + + if (dirty & IRIS_DIRTY_CONSTANTS_CS) { + iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) { + curbe.CURBETotalDataLength = + ALIGN(cs_prog_data->push.total.size, 64); + curbe.CURBEDataStartAddress = curbe_data_offset; + } } - struct pipe_resource *desc_res = NULL; - uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + if (dirty & (IRIS_DIRTY_SAMPLER_STATES_CS | + IRIS_DIRTY_BINDINGS_CS | + IRIS_DIRTY_CONSTANTS_CS | + IRIS_DIRTY_CS)) { + struct pipe_resource *desc_res = NULL; + uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; - iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) { - idd.SamplerStatePointer = shs->sampler_table.offset; - idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; - idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs; - idd.CrossThreadConstantDataReadLength = - cs_prog_data->push.cross_thread.regs; - } + iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) { + idd.SamplerStatePointer = shs->sampler_table.offset; + idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; + idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs; + idd.CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs; + } - for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++) - desc[i] |= ((uint32_t *) shader->derived_data)[i]; + for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++) + desc[i] |= ((uint32_t *) shader->derived_data)[i]; - iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) { - load.InterfaceDescriptorTotalLength = - GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); - load.InterfaceDescriptorDataStartAddress = - emit_state(batch, ice->state.dynamic_uploader, - &desc_res, desc, sizeof(desc), 32); - } + iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) { + load.InterfaceDescriptorTotalLength = + GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + load.InterfaceDescriptorDataStartAddress = + emit_state(batch, ice->state.dynamic_uploader, + &desc_res, desc, sizeof(desc), 32); + } - pipe_resource_reference(&desc_res, NULL); + pipe_resource_reference(&desc_res, NULL); + } uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2]; uint32_t remainder = group_size & (cs_prog_data->simd_size - 1); -- 2.30.2