X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fintel%2Fvulkan%2Fgen8_cmd_buffer.c;h=5b6afb3d70d5adf5283fbd99dacab08f00c7a54a;hb=6a049687841d87fc5bbd0fb0a192f03776f67630;hp=f1c82235d3ddb79073f8c929e1e7c334a84c4566;hpb=28cbc45b3c83d645bb2b805a0ed6008e2f9dad61;p=mesa.git

diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c
index f1c82235d3d..5b6afb3d70d 100644
--- a/src/intel/vulkan/gen8_cmd_buffer.c
+++ b/src/intel/vulkan/gen8_cmd_buffer.c
@@ -32,46 +32,6 @@
 #include "genxml/gen_macros.h"
 #include "genxml/genX_pack.h"
 
-static uint32_t
-cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
-{
-   static const uint32_t push_constant_opcodes[] = {
-      [MESA_SHADER_VERTEX]    = 21,
-      [MESA_SHADER_TESS_CTRL] = 25, /* HS */
-      [MESA_SHADER_TESS_EVAL] = 26, /* DS */
-      [MESA_SHADER_GEOMETRY]  = 22,
-      [MESA_SHADER_FRAGMENT]  = 23,
-      [MESA_SHADER_COMPUTE]   = 0,
-   };
-
-   VkShaderStageFlags flushed = 0;
-
-   anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
-      if (stage == MESA_SHADER_COMPUTE)
-         continue;
-
-      struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
-
-      if (state.offset == 0) {
-         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS),
-                        ._3DCommandSubOpcode = push_constant_opcodes[stage]);
-      } else {
-         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS),
-                        ._3DCommandSubOpcode = push_constant_opcodes[stage],
-                        .ConstantBody = {
-                           .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
-                           .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
-                        });
-      }
-
-      flushed |= mesa_to_vk_shader_stage(stage);
-   }
-
-   cmd_buffer->state.push_constants_dirty &= ~flushed;
-
-   return flushed;
-}
-
 #if GEN_GEN == 8
 static void
 emit_viewport_state(struct anv_cmd_buffer *cmd_buffer,
@@ -148,18 +108,13 @@ gen8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
 }
 #endif
 
-static void
-emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
-{
-   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM),
-                  .RegisterOffset = reg,
-                  .DataDWord = imm);
-}
-
-#define GEN8_L3CNTLREG 0x7034
+#define emit_lri(batch, reg, imm) \
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), \
+                  .RegisterOffset = __anv_reg_num(reg), \
+                  .DataDWord = imm)
 
-static void
-config_l3(struct anv_cmd_buffer *cmd_buffer, bool enable_slm)
+void
+genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, bool enable_slm)
 {
    /* References for GL state:
     *
@@ -167,38 +122,59 @@ config_l3(struct anv_cmd_buffer *cmd_buffer, bool enable_slm)
     * - src/mesa/drivers/dri/i965/gen7_l3_state.c
     */
 
-   uint32_t val = enable_slm ?
-      /* All = 48 ways; URB = 16 ways; DC and RO = 0, SLM = 1 */
-      0x60000021 :
-      /* All = 48 ways; URB = 48 ways; DC, RO and SLM = 0 */
-      0x60000060;
-   bool changed = cmd_buffer->state.current_l3_config != val;
+   uint32_t l3cr_slm, l3cr_noslm;
+   anv_pack_struct(&l3cr_noslm, GENX(L3CNTLREG),
+                   .URBAllocation = 48,
+                   .AllAllocation = 48);
+   anv_pack_struct(&l3cr_slm, GENX(L3CNTLREG),
+                   .SLMEnable = 1,
+                   .URBAllocation = 16,
+                   .AllAllocation = 48);
+   const uint32_t l3cr_val = enable_slm ? l3cr_slm : l3cr_noslm;
+   bool changed = cmd_buffer->state.current_l3_config != l3cr_val;
 
    if (changed) {
-      /* According to the hardware docs, the L3 partitioning can only be changed
-       * while the pipeline is completely drained and the caches are flushed,
-       * which involves a first PIPE_CONTROL flush which stalls the pipeline and
-       * initiates invalidation of the relevant caches...
+      /* According to the hardware docs, the L3 partitioning can only be
+       * changed while the pipeline is completely drained and the caches are
+       * flushed, which involves a first PIPE_CONTROL flush which stalls the
+       * pipeline...
        */
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
-                     .TextureCacheInvalidationEnable = true,
-                     .ConstantCacheInvalidationEnable = true,
-                     .InstructionCacheInvalidateEnable = true,
                      .DCFlushEnable = true,
                      .PostSyncOperation = NoWrite,
                      .CommandStreamerStallEnable = true);
 
-      /* ...followed by a second stalling flush which guarantees that
-       * invalidation is complete when the L3 configuration registers are
-       * modified.
+      /* ...followed by a second pipelined PIPE_CONTROL that initiates
+       * invalidation of the relevant caches. Note that because RO
+       * invalidation happens at the top of the pipeline (i.e. right away as
+       * the PIPE_CONTROL command is processed by the CS) we cannot combine it
+       * with the previous stalling flush as the hardware documentation
+       * suggests, because that would cause the CS to stall on previous
+       * rendering *after* RO invalidation and wouldn't prevent the RO caches
+       * from being polluted by concurrent rendering before the stall
+       * completes. This intentionally doesn't implement the SKL+ hardware
+       * workaround suggesting to enable CS stall on PIPE_CONTROLs with the
+       * texture cache invalidation bit set for GPGPU workloads because the
+       * previous and subsequent PIPE_CONTROLs already guarantee that there is
+       * no concurrent GPGPU kernel execution (see SKL HSD 2132585).
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
+                     .TextureCacheInvalidationEnable = true,
+                     .ConstantCacheInvalidationEnable = true,
+                     .InstructionCacheInvalidateEnable = true,
+                     .StateCacheInvalidationEnable = true,
+                     .PostSyncOperation = NoWrite);
+
+      /* Now send a third stalling flush to make sure that invalidation is
+       * complete when the L3 configuration registers are modified.
        */
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL),
                      .DCFlushEnable = true,
                      .PostSyncOperation = NoWrite,
                      .CommandStreamerStallEnable = true);
 
-      emit_lri(&cmd_buffer->batch, GEN8_L3CNTLREG, val);
-      cmd_buffer->state.current_l3_config = val;
+      emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG), l3cr_val);
+      cmd_buffer->state.current_l3_config = l3cr_val;
    }
 }
 
@@ -240,95 +216,6 @@ __emit_sf_state(struct anv_cmd_buffer *cmd_buffer)
    __emit_genx_sf_state(cmd_buffer);
 }
 
-void
-genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
-{
-   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
-   uint32_t *p;
-
-   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
-
-   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
-
-   config_l3(cmd_buffer, false);
-
-   genX(flush_pipeline_select_3d)(cmd_buffer);
-
-   if (vb_emit) {
-      const uint32_t num_buffers = __builtin_popcount(vb_emit);
-      const uint32_t num_dwords = 1 + num_buffers * 4;
-
-      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
-                          GENX(3DSTATE_VERTEX_BUFFERS));
-      uint32_t vb, i = 0;
-      for_each_bit(vb, vb_emit) {
-         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
-         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
-
-         struct GENX(VERTEX_BUFFER_STATE) state = {
-            .VertexBufferIndex = vb,
-            .MemoryObjectControlState = GENX(MOCS),
-            .AddressModifyEnable = true,
-            .BufferPitch = pipeline->binding_stride[vb],
-            .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
-            .BufferSize = buffer->size - offset
-         };
-
-         GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
-         i++;
-      }
-   }
-
-   cmd_buffer->state.vb_dirty &= ~vb_emit;
-
-   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
-      /* If somebody compiled a pipeline after starting a command buffer the
-       * scratch bo may have grown since we started this cmd buffer (and
-       * emitted STATE_BASE_ADDRESS). If we're binding that pipeline now,
-       * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
-      if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
-         anv_cmd_buffer_emit_state_base_address(cmd_buffer);
-
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
-
-      /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
-       *
-       *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
-       *    the next 3DPRIMITIVE command after programming the
-       *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
-       *
-       * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
-       * pipeline setup, we need to dirty push constants.
-       */
-      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
-   }
-
-   /* We emit the binding tables and sampler tables first, then emit push
-    * constants and then finally emit binding table and sampler table
-    * pointers. It has to happen in this order, since emitting the binding
-    * tables may change the push constants (in case of storage images). After
-    * emitting push constants, on SKL+ we have to emit the corresponding
-    * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
-    */
-   uint32_t dirty = 0;
-   if (cmd_buffer->state.descriptors_dirty)
-      dirty = gen7_cmd_buffer_flush_descriptor_sets(cmd_buffer);
-
-   if (cmd_buffer->state.push_constants_dirty)
-      dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
-
-   if (dirty)
-      gen7_cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
-
-   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
-      gen8_cmd_buffer_emit_viewport(cmd_buffer);
-
-   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
-      gen7_cmd_buffer_emit_scissor(cmd_buffer);
-
-   genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
-}
-
 void
 genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -573,30 +460,9 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
    assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
 
    bool needs_slm = cs_prog_data->base.total_shared > 0;
-   config_l3(cmd_buffer, needs_slm);
-
-   if (cmd_buffer->state.current_pipeline != GPGPU) {
-#if GEN_GEN < 10
-      /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
-       *
-       *   Software must clear the COLOR_CALC_STATE Valid field in
-       *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
-       *   with Pipeline Select set to GPGPU.
-       *
-       * The internal hardware docs recommend the same workaround for Gen9
-       * hardware too.
-       */
-      anv_batch_emit(&cmd_buffer->batch,
-                     GENX(3DSTATE_CC_STATE_POINTERS));
-#endif
+   genX(cmd_buffer_config_l3)(cmd_buffer, needs_slm);
 
-      anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT),
-#if GEN_GEN >= 9
-                     .MaskBits = 3,
-#endif
-                     .PipelineSelection = GPGPU);
-      cmd_buffer->state.current_pipeline = GPGPU;
-   }
+   genX(flush_pipeline_select_gpgpu)(cmd_buffer);
 
    if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)
       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
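For reference, a minimal, self-contained sketch of how the named anv_pack_struct() fields used above relate to the magic L3CNTLREG constants that the old config_l3() hard-coded (0x60000060 without SLM, 0x60000021 with SLM). The bit positions are an assumption based on the gen8 L3CNTLREG layout (SLMEnable at bit 0, URBAllocation starting at bit 1, AllAllocation starting at bit 25), and pack_l3cntlreg() is a hypothetical stand-in for the genxml-generated packing, not an anv API:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical re-creation of the packing anv_pack_struct() performs for
 * GENX(L3CNTLREG) on gen8; the shift positions are assumptions, not taken
 * from genxml. */
static uint32_t
pack_l3cntlreg(bool slm_enable, uint32_t urb_alloc, uint32_t all_alloc)
{
   return (slm_enable ? 1u : 0u) |   /* SLMEnable, bit 0 */
          (urb_alloc << 1) |         /* URBAllocation, starting at bit 1 */
          (all_alloc << 25);         /* AllAllocation, starting at bit 25 */
}

int
main(void)
{
   /* These should reproduce the constants the removed config_l3() used. */
   assert(pack_l3cntlreg(false, 48, 48) == 0x60000060u);
   assert(pack_l3cntlreg(true, 16, 48) == 0x60000021u);
   printf("no SLM: 0x%08x  SLM: 0x%08x\n",
          pack_l3cntlreg(false, 48, 48), pack_l3cntlreg(true, 16, 48));
   return 0;
}

Expressing the register through named fields rather than raw constants keeps the SLM/URB/All way counts readable and lets the per-gen GENX(L3CNTLREG) description supply the actual bit layout.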