/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "iris_context.h"
#include "util/hash_table.h"
#include "util/set.h"

/**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
 * The caller is responsible for deciding what flags are appropriate for the
 * given generation.
 */
void
iris_emit_pipe_control_flush(struct iris_batch *batch, uint32_t flags)
{
   if ((flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
       (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
      /* A pipe control command with flush and invalidate bits set
       * simultaneously is an inherently racy operation on Gen6+ if the
       * contents of the flushed caches were intended to become visible from
       * any of the invalidated caches. Split it into two PIPE_CONTROLs;
       * the first one should stall the pipeline to make sure that the
       * flushed R/W caches are coherent with memory once the specified R/O
       * caches are invalidated. On pre-Gen6 hardware the (implicit) R/O
       * cache invalidation seems to happen at the bottom of the pipeline
       * together with any write cache flush, so this shouldn't be a concern.
       * In order to ensure a full stall, we do an end-of-pipe sync.
       */
      iris_emit_end_of_pipe_sync(batch, flags & PIPE_CONTROL_CACHE_FLUSH_BITS);
      flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
   }

   batch->vtbl->emit_raw_pipe_control(batch, flags, NULL, 0, 0);
}
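
/* A minimal usage sketch (hypothetical call site, not part of this file):
 * a caller that wants to flush the render target cache and also invalidate
 * the texture cache can ask for both in a single call:
 *
 *    iris_emit_pipe_control_flush(batch,
 *                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
 *                                 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 *
 * Because both a flush bit and an invalidate bit are set, the helper above
 * splits the request into an end-of-pipe sync for the flush, followed by a
 * second PIPE_CONTROL carrying the invalidation.
 */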

/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 *  - PIPE_CONTROL_WRITE_IMMEDIATE
 *  - PIPE_CONTROL_WRITE_TIMESTAMP
 *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
iris_emit_pipe_control_write(struct iris_batch *batch, uint32_t flags,
                             struct iris_bo *bo, uint32_t offset,
                             uint64_t imm)
{
   batch->vtbl->emit_raw_pipe_control(batch, flags, bo, offset, imm);
}

/*
 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
 *
 *    Write synchronization is a special case of end-of-pipe
 *    synchronization that requires that the render cache and/or depth
 *    related caches are flushed to memory, where the data will become
 *    globally visible. This type of synchronization is required prior to
 *    SW (CPU) actually reading the result data from memory, or initiating
 *    an operation that will use as a read surface (such as a texture
 *    surface) a previous render target and/or depth/stencil buffer
 *
 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
 *
 *    Exercising the write cache flush bits (Render Target Cache Flush
 *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
 *    ensures the write caches are flushed and doesn't guarantee the data
 *    is globally visible.
 *
 *    SW can track the completion of the end-of-pipe-synchronization by
 *    using "Notify Enable" and "PostSync Operation - Write Immediate
 *    Data" in the PIPE_CONTROL command.
 */
void
iris_emit_end_of_pipe_sync(struct iris_batch *batch, uint32_t flags)
{
   /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
    *
    *    "The most common action to perform upon reaching a synchronization
    *    point is to write a value out to memory. An immediate value
    *    (included with the synchronization command) may be written."
    *
    * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
    *
    *    "In case the data flushed out by the render engine is to be read
    *    back in to the render engine in coherent manner, then the render
    *    engine has to wait for the fence completion before accessing the
    *    flushed data. This can be achieved by following means on various
    *    products: PIPE_CONTROL command with CS Stall and the required
    *    write caches flushed with Post-Sync-Operation as Write Immediate
    *    Data.
    *
    *    Example:
    *       - Workload-1 (3D/GPGPU/MEDIA)
    *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
    *         Data, Required Write Cache Flush bits set)
    *       - Workload-2 (Can use the data produced or output by Workload-1)"
    */
   iris_emit_pipe_control_write(batch, flags | PIPE_CONTROL_CS_STALL |
                                       PIPE_CONTROL_WRITE_IMMEDIATE,
                                batch->screen->workaround_bo, 0, 0);
}

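/**
 * Clear the sets tracking which BOs the render and depth caches have
 * written within this batch. Called once those caches have been flushed,
 * so the stale tracking entries can be dropped.
 */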
void
iris_cache_sets_clear(struct iris_batch *batch)
{
   struct hash_entry *render_entry;
   hash_table_foreach(batch->cache.render, render_entry)
      _mesa_hash_table_remove(batch->cache.render, render_entry);

   struct set_entry *depth_entry;
   set_foreach(batch->cache.depth, depth_entry)
      _mesa_set_remove(batch->cache.depth, depth_entry);
}

/**
 * Emits an appropriate flush for a BO if it has been rendered to within the
 * same batchbuffer as a read that's about to be emitted.
 *
 * The GPU has separate, incoherent render and sampler caches, along with
 * other caches. Usually data in the different caches doesn't interact
 * (e.g. we don't render to our driver-generated immediate constant data),
 * but for render-to-texture in FBOs we definitely do. When a batchbuffer
 * is flushed, the kernel will ensure that everything necessary is flushed
 * before another use of that BO, but for reuse from different caches within
 * a batchbuffer, it's all our responsibility.
 */
static void
flush_depth_and_render_caches(struct iris_batch *batch, struct iris_bo *bo)
{
   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                PIPE_CONTROL_CS_STALL);

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   iris_cache_sets_clear(batch);
}

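/**
 * Flush the render and depth caches if \p bo has been written by either of
 * them in this batch, so an upcoming read through another cache sees the
 * rendered data.
 */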
void
iris_cache_flush_for_read(struct iris_batch *batch,
                          struct iris_bo *bo)
{
   if (_mesa_hash_table_search(batch->cache.render, bo) ||
       _mesa_set_search(batch->cache.depth, bo))
      flush_depth_and_render_caches(batch, bo);
}

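/**
 * Pack a (format, aux usage) pair into a pointer-sized value, suitable for
 * storing as the data of a render cache hash table entry.
 */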
static void *
format_aux_tuple(enum isl_format format, enum isl_aux_usage aux_usage)
{
   return (void *)(uintptr_t)((uint32_t)format << 8 | aux_usage);
}

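/**
 * Flush caches as needed before rendering to \p bo with the given format
 * and aux usage in this batch.
 */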
void
iris_cache_flush_for_render(struct iris_batch *batch,
                            struct iris_bo *bo,
                            enum isl_format format,
                            enum isl_aux_usage aux_usage)
{
   if (_mesa_set_search(batch->cache.depth, bo))
      flush_depth_and_render_caches(batch, bo);

   /* Check to see if this bo has been used by a previous rendering operation
    * but with a different format or aux usage. If it has, flush the render
    * cache so we ensure that it's only in there with one format or aux usage
    * at a time.
    *
    * Even though it's not obvious, this can easily happen in practice.
    * Suppose a client is blending on a surface with sRGB encode enabled on
    * gen9. This implies that you get AUX_USAGE_CCS_D at best. If the client
    * then disables sRGB decode and continues blending we will flip on
    * AUX_USAGE_CCS_E without doing any sort of resolve in-between (this is
    * perfectly valid since CCS_E is a subset of CCS_D). However, this means
    * that we have fragments in-flight which are rendering with UNORM+CCS_E
    * and other fragments in-flight with SRGB+CCS_D on the same surface at the
    * same time and the pixel scoreboard and color blender are trying to sort
    * it all out. This ends badly (i.e. GPU hangs).
    *
    * To date, we have never observed GPU hangs or even corruption to be
    * associated with switching the format, only the aux usage. However,
    * there are comments in various docs which indicate that the render cache
    * isn't 100% resilient to format changes. We may as well be conservative
    * and flush on format changes too. We can always relax this later if we
    * find it to be a performance problem.
    */
   struct hash_entry *entry = _mesa_hash_table_search(batch->cache.render, bo);
   if (entry && entry->data != format_aux_tuple(format, aux_usage))
      flush_depth_and_render_caches(batch, bo);
}

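/**
 * Record that \p bo is about to be written through the render cache with
 * the given format and aux usage. In debug builds, assert that any earlier
 * rendering to the BO in this batch used the same format/aux combination
 * (otherwise a flush_for_render was missed).
 */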
void
iris_render_cache_add_bo(struct iris_batch *batch,
                         struct iris_bo *bo,
                         enum isl_format format,
                         enum isl_aux_usage aux_usage)
{
#ifndef NDEBUG
   struct hash_entry *entry = _mesa_hash_table_search(batch->cache.render, bo);
   if (entry) {
      /* Otherwise, someone didn't do a flush_for_render and that would be
       * very bad indeed.
       */
      assert(entry->data == format_aux_tuple(format, aux_usage));
   }
#endif

   _mesa_hash_table_insert(batch->cache.render, bo,
                           format_aux_tuple(format, aux_usage));
}

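/**
 * Flush the render and depth caches if \p bo has been written through the
 * render cache in this batch, before it is used for depth/stencil.
 */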
void
iris_cache_flush_for_depth(struct iris_batch *batch,
                           struct iris_bo *bo)
{
   if (_mesa_hash_table_search(batch->cache.render, bo))
      flush_depth_and_render_caches(batch, bo);
}

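/**
 * Record that \p bo is about to be written through the depth cache in this
 * batch.
 */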
void
iris_depth_cache_add_bo(struct iris_batch *batch, struct iris_bo *bo)
{
   _mesa_set_add(batch->cache.depth, bo);
}