From 2db471953ab57e47f9d950f474c1c8267cb0d456 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 16 Feb 2018 12:01:34 -0800 Subject: [PATCH] anv: Use a proper end-of-pipe sync instead of just CS stall Reviewed-by: Rafael Antognolli Part-of: --- src/intel/vulkan/anv_private.h | 9 ++- src/intel/vulkan/genX_cmd_buffer.c | 124 ++++++++++++++++++++++++++--- 2 files changed, 118 insertions(+), 15 deletions(-) diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 0072bd8422d..288d6ece12e 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2251,13 +2251,14 @@ enum anv_pipe_bits { ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT = (1 << 12), ANV_PIPE_DEPTH_STALL_BIT = (1 << 13), ANV_PIPE_CS_STALL_BIT = (1 << 20), + ANV_PIPE_END_OF_PIPE_SYNC_BIT = (1 << 21), /* This bit does not exist directly in PIPE_CONTROL. Instead it means that * a flush has happened but not a CS stall. The next time we do any sort * of invalidation we need to insert a CS stall at that time. Otherwise, * we would have to CS stall on every flush which could be bad. */ - ANV_PIPE_NEEDS_CS_STALL_BIT = (1 << 21), + ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22), /* This bit does not exist directly in PIPE_CONTROL. It means that render * target operations related to transfer commands with VkBuffer as @@ -2265,19 +2266,19 @@ enum anv_pipe_bits { * streamer might need to be aware of this to trigger the appropriate stall * before they can proceed with the copy. */ - ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 22), + ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23), /* This bit does not exist directly in PIPE_CONTROL. It means that Gen12 * AUX-TT data has changed and we need to invalidate AUX-TT data. This is * done by writing the AUX-TT register. */ - ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 23), + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24), /* This bit does not exist directly in PIPE_CONTROL. It means that a * PIPE_CONTROL with a post-sync operation will follow. This is used to * implement a workaround for Gen9. */ - ANV_PIPE_POST_SYNC_BIT = (1 << 24), + ANV_PIPE_POST_SYNC_BIT = (1 << 25), }; #define ANV_PIPE_FLUSH_BITS ( \ diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 99885298126..b7889680223 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1999,20 +1999,43 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) if (cmd_buffer->device->physical->always_flush_cache) bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; - /* Flushes are pipelined while invalidations are handled immediately. - * Therefore, if we're flushing anything then we need to schedule a stall - * before any invalidations can happen. + /* + * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": + * + * Write synchronization is a special case of end-of-pipe + * synchronization that requires that the render cache and/or depth + * related caches are flushed to memory, where the data will become + * globally visible. This type of synchronization is required prior to + * SW (CPU) actually reading the result data from memory, or initiating + * an operation that will use as a read surface (such as a texture + * surface) a previous render target and/or depth/stencil buffer + * + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Exercising the write cache flush bits (Render Target Cache Flush + * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only + * ensures the write caches are flushed and doesn't guarantee the data + * is globally visible. + * + * SW can track the completion of the end-of-pipe-synchronization by + * using "Notify Enable" and "PostSync Operation - Write Immediate + * Data" in the PIPE_CONTROL command. + * + * In other words, flushes are pipelined while invalidations are handled + * immediately. Therefore, if we're flushing anything then we need to + * schedule an end-of-pipe sync before any invalidations can happen. */ if (bits & ANV_PIPE_FLUSH_BITS) - bits |= ANV_PIPE_NEEDS_CS_STALL_BIT; + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; - /* If we're going to do an invalidate and we have a pending CS stall that - * has yet to be resolved, we do the CS stall now. + /* If we're going to do an invalidate and we have a pending end-of-pipe + * sync that has yet to be resolved, we do the end-of-pipe sync now. */ if ((bits & ANV_PIPE_INVALIDATE_BITS) && - (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) { - bits |= ANV_PIPE_CS_STALL_BIT; - bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT; + (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) { + bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; } if (GEN_GEN >= 12 && @@ -2069,7 +2092,8 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) bits &= ~ANV_PIPE_POST_SYNC_BIT; } - if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) { + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT)) { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { #if GEN_GEN >= 12 pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; @@ -2092,6 +2116,40 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": + * + * "The most common action to perform upon reaching a + * synchronization point is to write a value out to memory. An + * immediate value (included with the synchronization command) may + * be written." + * + * + * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": + * + * "In case the data flushed out by the render engine is to be + * read back in to the render engine in coherent manner, then the + * render engine has to wait for the fence completion before + * accessing the flushed data. This can be achieved by following + * means on various products: PIPE_CONTROL command with CS Stall + * and the required write caches flushed with Post-Sync-Operation + * as Write Immediate Data. + * + * Example: + * - Workload-1 (3D/GPGPU/MEDIA) + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write + * Immediate Data, Required Write Cache Flush bits set) + * - Workload-2 (Can use the data produce or output by + * Workload-1) + */ + if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) { + pipe.CommandStreamerStallEnable = true; + pipe.PostSyncOperation = WriteImmediateData; + pipe.Address = (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 0 + }; + } + /* * According to the Broadwell documentation, any PIPE_CONTROL with the * "Command Streamer Stall" bit set must also have another bit set, @@ -2123,7 +2181,51 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES); - bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT); + if (GEN_IS_HASWELL) { + /* Haswell needs addition work-arounds: + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Option 1: + * PIPE_CONTROL command with the CS Stall and the required write + * caches flushed with Post-SyncOperation as Write Immediate Data + * followed by eight dummy MI_STORE_DATA_IMM (write to scratch + * spce) commands. + * + * Example: + * - Workload-1 + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write + * Immediate Data, Required Write Cache Flush bits set) + * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address) + * - Workload-2 (Can use the data produce or output by + * Workload-1) + * + * Unfortunately, both the PRMs and the internal docs are a bit + * out-of-date in this regard. What the windows driver does (and + * this appears to actually work) is to emit a register read from the + * memory address written by the pipe control above. + * + * What register we load into doesn't matter. We choose an indirect + * rendering register because we know it always exists and it's one + * of the first registers the command parser allows us to write. If + * you don't have command parser support in your kernel (pre-4.2), + * this will get turned into MI_NOOP and you won't get the + * workaround. Unfortunately, there's just not much we can do in + * that case. This register is perfectly safe to write since we + * always re-load all of the indirect draw registers right before + * 3DPRIMITIVE when needed anyway. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = 0x243C; /* GEN7_3DPRIM_START_INSTANCE */ + lrm.MemoryAddress = (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 0 + }; + } + } + + bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT); } if (bits & ANV_PIPE_INVALIDATE_BITS) { -- 2.30.2