/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/**
 * @file iris_pipe_control.c
 *
 * PIPE_CONTROL is the main flushing and synchronization primitive on Intel
 * GPUs.  It can invalidate caches, stall until rendering reaches various
 * stages of completion, write to memory, and do other things.  In a way,
 * it's a swiss army knife command - it has all kinds of capabilities, but
 * some significant limitations as well.
 *
 * Unfortunately, it's notoriously complicated and difficult to use.  Many
 * sub-commands can't be used together.  Some are meant to be used at the
 * top of the pipeline (invalidating caches before drawing), while some are
 * meant to be used at the end (stalling or flushing after drawing).
 *
 * Also, there's a list of restrictions a mile long, which vary by generation.
 * Do this before doing that, or suffer the consequences (usually a GPU hang).
 *
 * This file contains helpers for emitting PIPE_CONTROLs safely.  You can
 * simply call iris_emit_pipe_control_flush() with the desired operations
 * (as logical PIPE_CONTROL_* bits), and it will take care of splitting it
 * into multiple PIPE_CONTROL commands as necessary.  The per-generation
 * workarounds are applied in iris_emit_raw_pipe_control() in iris_state.c.
 *
 * This file also contains our cache tracking helpers.  We have sets for
 * the render cache, depth cache, and so on.  If a BO is in the set, then
 * it may have data in that cache.  These take care of emitting flushes for
 * render-to-texture, format reinterpretation issues, and other situations.
 */

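/* A minimal usage sketch (a hypothetical call, not taken from a real call
 * site, using only PIPE_CONTROL_* bits that appear elsewhere in this file):
 * flush the depth and render caches while stalling the command streamer.
 *
 *    iris_emit_pipe_control_flush(batch,
 *                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 *                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
 *                                 PIPE_CONTROL_CS_STALL);
 */
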
#include "iris_context.h"
#include "util/hash_table.h"
#include "util/set.h"

/**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
 * The caller is responsible for deciding what flags are appropriate for the
 * given usage.
 */
void
iris_emit_pipe_control_flush(struct iris_batch *batch, uint32_t flags)
{
   if ((flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
       (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
      /* A pipe control command with flush and invalidate bits set
       * simultaneously is an inherently racy operation on Gen6+ if the
       * contents of the flushed caches were intended to become visible from
       * any of the invalidated caches.  Split it in two PIPE_CONTROLs; the
       * first one should stall the pipeline to make sure that the flushed
       * R/W caches are coherent with memory once the specified R/O caches
       * are invalidated.  On pre-Gen6 hardware the (implicit) R/O cache
       * invalidation seems to happen at the bottom of the pipeline together
       * with any write cache flush, so this shouldn't be a concern.  In
       * order to ensure a full stall, we do an end-of-pipe sync.
       */
      iris_emit_end_of_pipe_sync(batch, flags & PIPE_CONTROL_CACHE_FLUSH_BITS);
      flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
   }

   batch->vtbl->emit_raw_pipe_control(batch, flags, NULL, 0, 0);
}

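/* A sketch of the split behavior above (hypothetical flag combination):
 *
 *    // One logical request with both flush and invalidate bits...
 *    iris_emit_pipe_control_flush(batch,
 *                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
 *                                 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 *
 *    // ...is emitted as an end-of-pipe sync covering the flush bits,
 *    // followed by a second raw PIPE_CONTROL carrying the invalidation.
 */
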
/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 *  - PIPE_CONTROL_WRITE_IMMEDIATE
 *  - PIPE_CONTROL_WRITE_TIMESTAMP
 *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
iris_emit_pipe_control_write(struct iris_batch *batch, uint32_t flags,
                             struct iris_bo *bo, uint32_t offset,
                             uint64_t imm)
{
   batch->vtbl->emit_raw_pipe_control(batch, flags, bo, offset, imm);
}

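/* A usage sketch (hypothetical: "query_bo" is illustrative, not a real
 * call site): record a pipeline timestamp at offset 0 of a BO.  The
 * immediate value is unused for timestamp writes, so we pass 0.
 *
 *    iris_emit_pipe_control_write(batch, PIPE_CONTROL_WRITE_TIMESTAMP,
 *                                 query_bo, 0, 0);
 */
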
/*
 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
 *
 *    Write synchronization is a special case of end-of-pipe
 *    synchronization that requires that the render cache and/or depth
 *    related caches are flushed to memory, where the data will become
 *    globally visible. This type of synchronization is required prior to
 *    SW (CPU) actually reading the result data from memory, or initiating
 *    an operation that will use as a read surface (such as a texture
 *    surface) a previous render target and/or depth/stencil buffer
 *
 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
 *
 *    Exercising the write cache flush bits (Render Target Cache Flush
 *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
 *    ensures the write caches are flushed and doesn't guarantee the data
 *    is globally visible.
 *
 *    SW can track the completion of the end-of-pipe-synchronization by
 *    using "Notify Enable" and "PostSync Operation - Write Immediate
 *    Data" in the PIPE_CONTROL command.
 */
void
iris_emit_end_of_pipe_sync(struct iris_batch *batch, uint32_t flags)
{
   /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
    *
    *    "The most common action to perform upon reaching a synchronization
    *    point is to write a value out to memory. An immediate value
    *    (included with the synchronization command) may be written."
    *
    * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
    *
    *    "In case the data flushed out by the render engine is to be read
    *    back in to the render engine in coherent manner, then the render
    *    engine has to wait for the fence completion before accessing the
    *    flushed data. This can be achieved by following means on various
    *    products: PIPE_CONTROL command with CS Stall and the required
    *    write caches flushed with Post-Sync-Operation as Write Immediate
    *    Data.
    *
    *    Example:
    *       - Workload-1 (3D/GPGPU/MEDIA)
    *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
    *         Data, Required Write Cache Flush bits set)
    *       - Workload-2 (Can use the data produce or output by Workload-1)"
    */
   iris_emit_pipe_control_write(batch, flags | PIPE_CONTROL_CS_STALL |
                                       PIPE_CONTROL_WRITE_IMMEDIATE,
                                batch->screen->workaround_bo, 0, 0);
}

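/* Mapped onto the helper above, the PRM's example becomes (a sketch, with
 * the workloads standing in for real draw/dispatch emission):
 *
 *    ... emit Workload-1 ...
 *    iris_emit_end_of_pipe_sync(batch, PIPE_CONTROL_RENDER_TARGET_FLUSH);
 *    ... emit Workload-2, which reads Workload-1's results ...
 */
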
static void
iris_cache_sets_clear(struct iris_batch *batch)
{
   struct hash_entry *render_entry;
   hash_table_foreach(batch->cache.render, render_entry)
      _mesa_hash_table_remove(batch->cache.render, render_entry);

   struct set_entry *depth_entry;
   set_foreach(batch->cache.depth, depth_entry)
      _mesa_set_remove(batch->cache.depth, depth_entry);
}

/**
 * Emits an appropriate flush for a BO if it has been rendered to within the
 * same batchbuffer as a read that's about to be emitted.
 *
 * The GPU has separate, incoherent caches for the render cache and the
 * sampler cache, along with other caches.  Usually data in the different
 * caches don't interact (e.g. we don't render to our driver-generated
 * immediate constant data), but for render-to-texture in FBOs we definitely
 * do.  When a batchbuffer is flushed, the kernel will ensure that everything
 * necessary is flushed before another use of that BO, but for reuse from
 * different caches within a batchbuffer, it's all our responsibility.
 */
static void
flush_depth_and_render_caches(struct iris_batch *batch)
{
   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                PIPE_CONTROL_CS_STALL);

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE);

   iris_cache_sets_clear(batch);
}

void
iris_cache_flush_for_read(struct iris_batch *batch,
                          struct iris_bo *bo)
{
   if (_mesa_hash_table_search(batch->cache.render, bo) ||
       _mesa_set_search(batch->cache.depth, bo))
      flush_depth_and_render_caches(batch);
}

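/* A sketch of the tracking flow these helpers implement (hypothetical
 * ordering; the real calls are spread across the driver): after drawing to
 * a BO, record it; before sampling it in the same batch, flush.
 *
 *    iris_render_cache_add_bo(batch, bo, format, aux_usage);
 *    ... more rendering ...
 *    iris_cache_flush_for_read(batch, bo);  // flushes, since bo is tracked
 *    ... sample from bo ...
 */
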
/* Pack a (format, aux usage) pair into a pointer-sized hash table value.
 * This assumes aux_usage fits in the low 8 bits.
 */
static void *
format_aux_tuple(enum isl_format format, enum isl_aux_usage aux_usage)
{
   return (void *)(uintptr_t)((uint32_t)format << 8 | aux_usage);
}

void
iris_cache_flush_for_render(struct iris_batch *batch,
                            struct iris_bo *bo,
                            enum isl_format format,
                            enum isl_aux_usage aux_usage)
{
   if (_mesa_set_search(batch->cache.depth, bo))
      flush_depth_and_render_caches(batch);

   /* Check to see if this bo has been used by a previous rendering operation
    * but with a different format or aux usage.  If it has, flush the render
    * cache so we ensure that it's only in there with one format or aux usage
    * at a time.
    *
    * Even though it's not obvious, this can easily happen in practice.
    * Suppose a client is blending on a surface with sRGB encode enabled on
    * gen9.  This implies that you get AUX_USAGE_CCS_D at best.  If the
    * client then disables sRGB encode and continues blending we will flip
    * on AUX_USAGE_CCS_E without doing any sort of resolve in-between (this
    * is perfectly valid since CCS_E is a subset of CCS_D).  However, this
    * means that we have fragments in-flight which are rendering with
    * UNORM+CCS_E and other fragments in-flight with SRGB+CCS_D on the same
    * surface at the same time and the pixel scoreboard and color blender
    * are trying to sort it all out.  This ends badly (i.e. GPU hangs).
    *
    * To date, we have never observed GPU hangs or even corruption to be
    * associated with switching the format, only the aux usage.  However,
    * there are comments in various docs which indicate that the render
    * cache isn't 100% resilient to format changes.  We may as well be
    * conservative and flush on format changes too.  We can always relax
    * this later if we find it to be a performance problem.
    */
   struct hash_entry *entry =
      _mesa_hash_table_search(batch->cache.render, bo);
   if (entry && entry->data != format_aux_tuple(format, aux_usage))
      flush_depth_and_render_caches(batch);
}

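/* As a concrete sketch of the scenario above (hypothetical calls): the same
 * BO tracked as (SRGB, CCS_D), then re-bound as (UNORM, CCS_E), makes the
 * tuple comparison fail and triggers the flush.
 *
 *    iris_render_cache_add_bo(batch, bo, ISL_FORMAT_R8G8B8A8_UNORM_SRGB,
 *                             ISL_AUX_USAGE_CCS_D);
 *    ...
 *    iris_cache_flush_for_render(batch, bo, ISL_FORMAT_R8G8B8A8_UNORM,
 *                                ISL_AUX_USAGE_CCS_E);  // flushes
 */
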
void
iris_render_cache_add_bo(struct iris_batch *batch,
                         struct iris_bo *bo,
                         enum isl_format format,
                         enum isl_aux_usage aux_usage)
{
#ifndef NDEBUG
   struct hash_entry *entry =
      _mesa_hash_table_search(batch->cache.render, bo);
   if (entry) {
      /* Otherwise, someone didn't do a flush_for_render and that would be
       * very bad indeed.
       */
      assert(entry->data == format_aux_tuple(format, aux_usage));
   }
#endif

   _mesa_hash_table_insert(batch->cache.render, bo,
                           format_aux_tuple(format, aux_usage));
}

void
iris_cache_flush_for_depth(struct iris_batch *batch,
                           struct iris_bo *bo)
{
   if (_mesa_hash_table_search(batch->cache.render, bo))
      flush_depth_and_render_caches(batch);
}

void
iris_depth_cache_add_bo(struct iris_batch *batch, struct iris_bo *bo)
{
   _mesa_set_add(batch->cache.depth, bo);
}

static void
iris_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct iris_context *ice = (void *) ctx;

   // XXX: compute batch?

   flush_depth_and_render_caches(&ice->render_batch);
}

static void
iris_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
   struct iris_context *ice = (void *) ctx;
   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;

   if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
                PIPE_BARRIER_INDEX_BUFFER |
                PIPE_BARRIER_INDIRECT_BUFFER)) {
      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
   }

   if (flags & PIPE_BARRIER_CONSTANT_BUFFER) {
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
              PIPE_CONTROL_CONST_CACHE_INVALIDATE;
   }

   if (flags & PIPE_BARRIER_TEXTURE) {
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
   }

   if (flags & PIPE_BARRIER_FRAMEBUFFER) {
      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
              PIPE_CONTROL_RENDER_TARGET_FLUSH;
   }

   // XXX: MAPPED_BUFFER, QUERY_BUFFER, STREAMOUT_BUFFER, GLOBAL_BUFFER?
   // XXX: compute batch?

   iris_emit_pipe_control_flush(&ice->render_batch, bits);
}

void
iris_init_flush_functions(struct pipe_context *ctx)
{
   ctx->memory_barrier = iris_memory_barrier;
   ctx->texture_barrier = iris_texture_barrier;
}