src/gallium/drivers/iris/iris_pipe_control.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included
  12  * in all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20  * DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 /**
  24  * @file iris_pipe_control.c
  25  *
  26  * PIPE_CONTROL is the main flushing and synchronization primitive on Intel
  27  * GPUs.  It can invalidate caches, stall until rendering reaches various
  28  * stages of completion, write to memory, and other things.  In a way, it's
  29  * a swiss army knife command - it has all kinds of capabilities, but some
  30  * significant limitations as well.
  31  *
  32  * Unfortunately, it's notoriously complicated and difficult to use.  Many
  33  * sub-commands can't be used together.  Some are meant to be used at the
  34  * top of the pipeline (invalidating caches before drawing), while some are
  35  * meant to be used at the end (stalling or flushing after drawing).
  36  *
  37  * Also, there's a list of restrictions a mile long, which vary by generation.
  38  * Do this before doing that, or suffer the consequences (usually a GPU hang).
  39  *
  40  * This file contains helpers for emitting them safely.  You can simply call
  41  * iris_emit_pipe_control_flush() with the desired operations (as logical
  42  * PIPE_CONTROL_* bits), and it will take care of splitting it into multiple
  43  * PIPE_CONTROL commands as necessary.  The per-generation workarounds are
  44  * applied in iris_emit_raw_pipe_control() in iris_state.c.
  45  */
  46
  47 #include "iris_context.h"
  48 #include "util/hash_table.h"
  49 #include "util/set.h"
  50
  51 /**
  52  * Emit a PIPE_CONTROL with various flushing flags.
  53  *
  54  * The caller is responsible for deciding what flags are appropriate for the
  55  * given generation.
  56  */
  57 void
  58 iris_emit_pipe_control_flush(struct iris_batch *batch,
  59                              const char *reason,
  60                              uint32_t flags)
  61 {
  62    if ((flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
  63        (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
  64       /* A pipe control command with flush and invalidate bits set
  65        * simultaneously is an inherently racy operation on Gen6+ if the
  66        * contents of the flushed caches were intended to become visible from
  67        * any of the invalidated caches.  Split it in two PIPE_CONTROLs, the
  68        * first one should stall the pipeline to make sure that the flushed R/W
  69        * caches are coherent with memory once the specified R/O caches are
  70        * invalidated.  On pre-Gen6 hardware the (implicit) R/O cache
  71        * invalidation seems to happen at the bottom of the pipeline together
  72        * with any write cache flush, so this shouldn't be a concern.  In order
  73        * to ensure a full stall, we do an end-of-pipe sync.
  74        */
  75       iris_emit_end_of_pipe_sync(batch, reason,
  76                                  flags & PIPE_CONTROL_CACHE_FLUSH_BITS);
  77       flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
  78    }
  79
  80    batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0);
  81 }
  82
  83 /**
  84  * Emit a PIPE_CONTROL that writes to a buffer object.
  85  *
  86  * \p flags should contain one of the following items:
  87  *  - PIPE_CONTROL_WRITE_IMMEDIATE
  88  *  - PIPE_CONTROL_WRITE_TIMESTAMP
  89  *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
  90  */
  91 void
  92 iris_emit_pipe_control_write(struct iris_batch *batch,
  93                              const char *reason, uint32_t flags,
  94                              struct iris_bo *bo, uint32_t offset,
  95                              uint64_t imm)
  96 {
  97    batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, bo, offset, imm);
  98 }
  99
 100 /*
 101  * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
 102  *
 103  *  Write synchronization is a special case of end-of-pipe
 104  *  synchronization that requires that the render cache and/or depth
 105  *  related caches are flushed to memory, where the data will become
 106  *  globally visible. This type of synchronization is required prior to
 107  *  SW (CPU) actually reading the result data from memory, or initiating
 108  *  an operation that will use as a read surface (such as a texture
 109  *  surface) a previous render target and/or depth/stencil buffer
 110  *
 111  * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
 112  *
 113  *  Exercising the write cache flush bits (Render Target Cache Flush
 114  *  Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
 115  *  ensures the write caches are flushed and doesn't guarantee the data
 116  *  is globally visible.
 117  *
 118  *  SW can track the completion of the end-of-pipe-synchronization by
 119  *  using "Notify Enable" and "PostSync Operation - Write Immediate
 120  *  Data" in the PIPE_CONTROL command.
 121  */
 122 void
 123 iris_emit_end_of_pipe_sync(struct iris_batch *batch,
 124                            const char *reason, uint32_t flags)
 125 {
 126    /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
 127     *
 128     *    "The most common action to perform upon reaching a synchronization
 129     *    point is to write a value out to memory. An immediate value
 130     *    (included with the synchronization command) may be written."
 131     *
 132     * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
 133     *
 134     *    "In case the data flushed out by the render engine is to be read
 135     *    back in to the render engine in coherent manner, then the render
 136     *    engine has to wait for the fence completion before accessing the
 137     *    flushed data. This can be achieved by following means on various
 138     *    products: PIPE_CONTROL command with CS Stall and the required
 139     *    write caches flushed with Post-Sync-Operation as Write Immediate
 140     *    Data.
 141     *
 142     *    Example:
 143     *       - Workload-1 (3D/GPGPU/MEDIA)
 144     *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
 145     *         Data, Required Write Cache Flush bits set)
 146     *       - Workload-2 (Can use the data produce or output by Workload-1)
 147     */
 148    iris_emit_pipe_control_write(batch, reason,
 149                                 flags | PIPE_CONTROL_CS_STALL |
 150                                 PIPE_CONTROL_WRITE_IMMEDIATE,
 151                                 batch->screen->workaround_address.bo,
 152                                 batch->screen->workaround_address.offset, 0);
 153 }
 154
 155 /**
 156  * Emits appropriate flushes and invalidations for any previous memory
 157  * operations on \p bo to be strictly ordered relative to any subsequent
 158  * memory operations performed from the caching domain \p access.
 159  *
 160  * This is useful because the GPU has separate incoherent caches for the
 161  * render target, sampler, etc., which need to be explicitly invalidated or
 162  * flushed in order to obtain the expected memory ordering in cases where the
 163  * same surface is accessed through multiple caches (e.g. due to
 164  * render-to-texture).
 165  *
 166  * This provides the expected memory ordering guarantees whether or not the
 167  * previous access was performed from the same batch or a different one, but
 168  * only the former case needs to be handled explicitly here, since the kernel
 169  * already inserts implicit flushes and synchronization in order to guarantee
 170  * that any data dependencies between batches are satisfied.
 171  *
 172  * Even though no flushing nor invalidation is required in order to account
 173  * for concurrent updates from other batches, we provide the guarantee that a
 174  * required synchronization operation due to a previous batch-local update
 175  * will never be omitted due to the influence of another thread accessing the
 176  * same buffer concurrently from the same caching domain: Such a concurrent
 177  * update will only ever change the seqno of the last update to a value
 178  * greater than the local value (see iris_bo_bump_seqno()), which means that
 179  * we will always emit at least as much flushing and invalidation as we would
 180  * have for the local seqno (see the coherent_seqnos comparisons below).
 181  */
 182 void
 183 iris_emit_buffer_barrier_for(struct iris_batch *batch,
 184                              struct iris_bo *bo,
 185                              enum iris_domain access)
 186 {
 187    const uint32_t all_flush_bits = (PIPE_CONTROL_CACHE_FLUSH_BITS |
 188                                     PIPE_CONTROL_STALL_AT_SCOREBOARD |
 189                                     PIPE_CONTROL_FLUSH_ENABLE);
 190    const uint32_t flush_bits[NUM_IRIS_DOMAINS] = {
 191       [IRIS_DOMAIN_RENDER_WRITE] = PIPE_CONTROL_RENDER_TARGET_FLUSH,
 192       [IRIS_DOMAIN_DEPTH_WRITE] = PIPE_CONTROL_DEPTH_CACHE_FLUSH,
 193       [IRIS_DOMAIN_OTHER_WRITE] = PIPE_CONTROL_FLUSH_ENABLE,
 194       [IRIS_DOMAIN_OTHER_READ] = PIPE_CONTROL_STALL_AT_SCOREBOARD,
 195    };
 196    const uint32_t invalidate_bits[NUM_IRIS_DOMAINS] = {
 197       [IRIS_DOMAIN_RENDER_WRITE] = PIPE_CONTROL_RENDER_TARGET_FLUSH,
 198       [IRIS_DOMAIN_DEPTH_WRITE] = PIPE_CONTROL_DEPTH_CACHE_FLUSH,
 199       [IRIS_DOMAIN_OTHER_WRITE] = PIPE_CONTROL_FLUSH_ENABLE,
 200       [IRIS_DOMAIN_OTHER_READ] = (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 201                                   PIPE_CONTROL_CONST_CACHE_INVALIDATE),
 202    };
 203    uint32_t bits = 0;
 204
 205    /* Iterate over all read/write domains first in order to handle RaW
 206     * and WaW dependencies, which might involve flushing the domain of
 207     * the previous access and invalidating the specified domain.
 208     */
 209    for (unsigned i = 0; i < IRIS_DOMAIN_OTHER_WRITE; i++) {
 210       assert(!iris_domain_is_read_only(i));
 211       if (i != access) {
 212          const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]);
 213
 214          /* Invalidate unless the most recent read/write access from
 215           * this domain is already guaranteed to be visible to the
 216           * specified domain.  Flush if the most recent access from
 217           * this domain occurred after its most recent flush.
 218           */
 219          if (seqno > batch->coherent_seqnos[access][i]) {
 220             bits |= invalidate_bits[access];
 221
 222             if (seqno > batch->coherent_seqnos[i][i])
 223                bits |= flush_bits[i];
 224          }
 225       }
 226    }
 227
 228    /* All read-only domains can be considered mutually coherent since
 229     * the order of read-only memory operations is immaterial.  If the
 230     * specified domain is read/write we need to iterate over them too,
 231     * in order to handle any WaR dependencies.
 232     */
 233    if (!iris_domain_is_read_only(access)) {
 234       for (unsigned i = IRIS_DOMAIN_OTHER_READ; i < NUM_IRIS_DOMAINS; i++) {
 235          assert(iris_domain_is_read_only(i));
 236          const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]);
 237
 238          /* Flush if the most recent access from this domain occurred
 239           * after its most recent flush.
 240           */
 241          if (seqno > batch->coherent_seqnos[i][i])
 242             bits |= flush_bits[i];
 243       }
 244    }
 245
 246    /* The IRIS_DOMAIN_OTHER_WRITE kitchen-sink domain cannot be
 247     * considered coherent with itself since it's really a collection
 248     * of multiple incoherent read/write domains, so we special-case it
 249     * here.
 250     */
 251    const unsigned i = IRIS_DOMAIN_OTHER_WRITE;
 252    const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]);
 253
 254    /* Invalidate unless the most recent read/write access from this
 255     * domain is already guaranteed to be visible to the specified
 256     * domain.  Flush if the most recent access from this domain
 257     * occurred after its most recent flush.
 258     */
 259    if (seqno > batch->coherent_seqnos[access][i]) {
 260       bits |= invalidate_bits[access];
 261
 262       if (seqno > batch->coherent_seqnos[i][i])
 263          bits |= flush_bits[i];
 264    }
 265
 266    if (bits) {
 267       /* Stall-at-scoreboard is not expected to work in combination with other
 268        * flush bits.
 269        */
 270       if (bits & PIPE_CONTROL_CACHE_FLUSH_BITS)
 271          bits &= ~PIPE_CONTROL_STALL_AT_SCOREBOARD;
 272
 273       /* Emit any required flushes and invalidations. */
 274       if (bits & all_flush_bits)
 275          iris_emit_end_of_pipe_sync(batch, "cache tracker: flush",
 276                                     bits & all_flush_bits);
 277
 278       if (bits & ~all_flush_bits)
 279          iris_emit_pipe_control_flush(batch, "cache tracker: invalidate",
 280                                       bits & ~all_flush_bits);
 281    }
 282 }
 283
 284 /**
 285  * Flush and invalidate all caches (for debugging purposes).
 286  */
 287 void
 288 iris_flush_all_caches(struct iris_batch *batch)
 289 {
 290    iris_emit_pipe_control_flush(batch, "debug: flush all caches",
 291                                 PIPE_CONTROL_CS_STALL |
 292                                 PIPE_CONTROL_DATA_CACHE_FLUSH |
 293                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 294                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
 295                                 PIPE_CONTROL_VF_CACHE_INVALIDATE |
 296                                 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
 297                                 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 298                                 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
 299                                 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
 300 }
 301
 302 static void
 303 iris_texture_barrier(struct pipe_context *ctx, unsigned flags)
 304 {
 305    struct iris_context *ice = (void *) ctx;
 306    struct iris_batch *render_batch = &ice->batches[IRIS_BATCH_RENDER];
 307    struct iris_batch *compute_batch = &ice->batches[IRIS_BATCH_COMPUTE];
 308
 309    if (render_batch->contains_draw) {
 310       iris_batch_maybe_flush(render_batch, 48);
 311       iris_emit_pipe_control_flush(render_batch,
 312                                    "API: texture barrier (1/2)",
 313                                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 314                                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
 315                                    PIPE_CONTROL_CS_STALL);
 316       iris_emit_pipe_control_flush(render_batch,
 317                                    "API: texture barrier (2/2)",
 318                                    PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 319    }
 320
 321    if (compute_batch->contains_draw) {
 322       iris_batch_maybe_flush(compute_batch, 48);
 323       iris_emit_pipe_control_flush(compute_batch,
 324                                    "API: texture barrier (1/2)",
 325                                    PIPE_CONTROL_CS_STALL);
 326       iris_emit_pipe_control_flush(compute_batch,
 327                                    "API: texture barrier (2/2)",
 328                                    PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 329    }
 330 }
 331
 332 static void
 333 iris_memory_barrier(struct pipe_context *ctx, unsigned flags)
 334 {
 335    struct iris_context *ice = (void *) ctx;
 336    unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
 337
 338    if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
 339                 PIPE_BARRIER_INDEX_BUFFER |
 340                 PIPE_BARRIER_INDIRECT_BUFFER)) {
 341       bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 342    }
 343
 344    if (flags & PIPE_BARRIER_CONSTANT_BUFFER) {
 345       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 346               PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 347    }
 348
 349    if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) {
 350       bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
 351               PIPE_CONTROL_RENDER_TARGET_FLUSH;
 352    }
 353
 354    for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
 355       if (ice->batches[i].contains_draw) {
 356          iris_batch_maybe_flush(&ice->batches[i], 24);
 357          iris_emit_pipe_control_flush(&ice->batches[i], "API: memory barrier",
 358                                       bits);
 359       }
 360    }
 361 }
 362
 363 void
 364 iris_init_flush_functions(struct pipe_context *ctx)
 365 {
 366    ctx->memory_barrier = iris_memory_barrier;
 367    ctx->texture_barrier = iris_texture_barrier;
 368 }