src/mesa/drivers/dri/i965/intel_batchbuffer.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2006 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 #include "intel_batchbuffer.h"
  29 #include "intel_buffer_objects.h"
  30 #include "intel_reg.h"
  31 #include "intel_bufmgr.h"
  32 #include "intel_buffers.h"
  33 #include "intel_fbo.h"
  34 #include "brw_context.h"
  35
  36 static void
  37 intel_batchbuffer_reset(struct brw_context *brw);
  38
  39 void
  40 intel_batchbuffer_init(struct brw_context *brw)
  41 {
  42    intel_batchbuffer_reset(brw);
  43
  44    if (brw->gen >= 6) {
  45       /* We can't just use brw_state_batch to get a chunk of space for
  46        * the gen6 workaround because it involves actually writing to
  47        * the buffer, and the kernel doesn't let us write to the batch.
  48        */
  49       brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
  50                                                       "pipe_control workaround",
  51                                                       4096, 4096);
  52    }
  53
  54    brw->batch.need_workaround_flush = true;
  55
  56    if (!brw->has_llc) {
  57       brw->batch.cpu_map = malloc(BATCH_SZ);
  58       brw->batch.map = brw->batch.cpu_map;
  59    }
  60 }
  61
  62 static void
  63 intel_batchbuffer_reset(struct brw_context *brw)
  64 {
  65    if (brw->batch.last_bo != NULL) {
  66       drm_intel_bo_unreference(brw->batch.last_bo);
  67       brw->batch.last_bo = NULL;
  68    }
  69    brw->batch.last_bo = brw->batch.bo;
  70
  71    brw_render_cache_set_clear(brw);
  72
  73    brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
  74                                         BATCH_SZ, 4096);
  75    if (brw->has_llc) {
  76       drm_intel_bo_map(brw->batch.bo, true);
  77       brw->batch.map = brw->batch.bo->virtual;
  78    }
  79
  80    brw->batch.reserved_space = BATCH_RESERVED;
  81    brw->batch.state_batch_offset = brw->batch.bo->size;
  82    brw->batch.used = 0;
  83    brw->batch.needs_sol_reset = false;
  84    brw->batch.pipe_controls_since_last_cs_stall = 0;
  85
  86    /* We don't know what ring the new batch will be sent to until we see the
  87     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
  88     */
  89    brw->batch.ring = UNKNOWN_RING;
  90 }
  91
  92 void
  93 intel_batchbuffer_save_state(struct brw_context *brw)
  94 {
  95    brw->batch.saved.used = brw->batch.used;
  96    brw->batch.saved.reloc_count =
  97       drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
  98 }
  99
 100 void
 101 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 102 {
 103    drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
 104
 105    brw->batch.used = brw->batch.saved.used;
 106    if (brw->batch.used == 0)
 107       brw->batch.ring = UNKNOWN_RING;
 108 }
 109
 110 void
 111 intel_batchbuffer_free(struct brw_context *brw)
 112 {
 113    free(brw->batch.cpu_map);
 114    drm_intel_bo_unreference(brw->batch.last_bo);
 115    drm_intel_bo_unreference(brw->batch.bo);
 116    drm_intel_bo_unreference(brw->batch.workaround_bo);
 117 }
 118
 119 static void
 120 do_batch_dump(struct brw_context *brw)
 121 {
 122    struct drm_intel_decode *decode;
 123    struct intel_batchbuffer *batch = &brw->batch;
 124    int ret;
 125
 126    decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
 127    if (!decode)
 128       return;
 129
 130    ret = drm_intel_bo_map(batch->bo, false);
 131    if (ret == 0) {
 132       drm_intel_decode_set_batch_pointer(decode,
 133                                          batch->bo->virtual,
 134                                          batch->bo->offset64,
 135                                          batch->used);
 136    } else {
 137       fprintf(stderr,
 138               "WARNING: failed to map batchbuffer (%s), "
 139               "dumping uploaded data instead.\n", strerror(ret));
 140
 141       drm_intel_decode_set_batch_pointer(decode,
 142                                          batch->map,
 143                                          batch->bo->offset64,
 144                                          batch->used);
 145    }
 146
 147    drm_intel_decode_set_output_file(decode, stderr);
 148    drm_intel_decode(decode);
 149
 150    drm_intel_decode_context_free(decode);
 151
 152    if (ret == 0) {
 153       drm_intel_bo_unmap(batch->bo);
 154
 155       brw_debug_batch(brw);
 156    }
 157 }
 158
 159 void
 160 intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
 161 {
 162    /* We may need to enable and snapshot OA counters. */
 163    brw_perf_monitor_new_batch(brw);
 164 }
 165
 166 /**
 167  * Called when starting a new batch buffer.
 168  */
 169 static void
 170 brw_new_batch(struct brw_context *brw)
 171 {
 172    /* Create a new batchbuffer and reset the associated state: */
 173    intel_batchbuffer_reset(brw);
 174
 175    /* If the kernel supports hardware contexts, then most hardware state is
 176     * preserved between batches; we only need to re-emit state that is required
 177     * to be in every batch.  Otherwise we need to re-emit all the state that
 178     * would otherwise be stored in the context (which for all intents and
 179     * purposes means everything).
 180     */
 181    if (brw->hw_ctx == NULL)
 182       brw->state.dirty.brw |= BRW_NEW_CONTEXT;
 183
 184    brw->state.dirty.brw |= BRW_NEW_BATCH;
 185
 186    /* Assume that the last command before the start of our batch was a
 187     * primitive, for safety.
 188     */
 189    brw->batch.need_workaround_flush = true;
 190
 191    brw->state_batch_count = 0;
 192
 193    brw->ib.type = -1;
 194
 195    /* We need to periodically reap the shader time results, because rollover
 196     * happens every few seconds.  We also want to see results every once in a
 197     * while, because many programs won't cleanly destroy our context, so the
 198     * end-of-run printout may not happen.
 199     */
 200    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
 201       brw_collect_and_report_shader_time(brw);
 202
 203    if (INTEL_DEBUG & DEBUG_PERFMON)
 204       brw_dump_perf_monitors(brw);
 205 }
 206
 207 /**
 208  * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 209  * sending it off.
 210  *
 211  * This function can emit state (say, to preserve registers that aren't saved
 212  * between batches).  All of this state MUST fit in the reserved space at the
 213  * end of the batchbuffer.  If you add more GPU state, increase the reserved
 214  * space by updating the BATCH_RESERVED macro.
 215  */
 216 static void
 217 brw_finish_batch(struct brw_context *brw)
 218 {
 219    /* Capture the closing pipeline statistics register values necessary to
 220     * support query objects (in the non-hardware context world).
 221     */
 222    brw_emit_query_end(brw);
 223
 224    /* We may also need to snapshot and disable OA counters. */
 225    if (brw->batch.ring == RENDER_RING)
 226       brw_perf_monitor_finish_batch(brw);
 227
 228    /* Mark that the current program cache BO has been used by the GPU.
 229     * It will be reallocated if we need to put new programs in for the
 230     * next batch.
 231     */
 232    brw->cache.bo_used_by_gpu = true;
 233 }
 234
 235 /* TODO: Push this whole function into bufmgr.
 236  */
 237 static int
 238 do_flush_locked(struct brw_context *brw)
 239 {
 240    struct intel_batchbuffer *batch = &brw->batch;
 241    int ret = 0;
 242
 243    if (brw->has_llc) {
 244       drm_intel_bo_unmap(batch->bo);
 245    } else {
 246       ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
 247       if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
 248          ret = drm_intel_bo_subdata(batch->bo,
 249                                     batch->state_batch_offset,
 250                                     batch->bo->size - batch->state_batch_offset,
 251                                     (char *)batch->map + batch->state_batch_offset);
 252       }
 253    }
 254
 255    if (!brw->intelScreen->no_hw) {
 256       int flags;
 257
 258       if (brw->gen >= 6 && batch->ring == BLT_RING) {
 259          flags = I915_EXEC_BLT;
 260       } else {
 261          flags = I915_EXEC_RENDER;
 262       }
 263       if (batch->needs_sol_reset)
 264          flags |= I915_EXEC_GEN7_SOL_RESET;
 265
 266       if (ret == 0) {
 267          if (unlikely(INTEL_DEBUG & DEBUG_AUB))
 268             brw_annotate_aub(brw);
 269          if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
 270             ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
 271                                         flags);
 272          } else {
 273             ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
 274                                                 4 * batch->used, flags);
 275          }
 276       }
 277    }
 278
 279    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
 280       do_batch_dump(brw);
 281
 282    if (ret != 0) {
 283       fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
 284       exit(1);
 285    }
 286
 287    return ret;
 288 }
 289
 290 int
 291 _intel_batchbuffer_flush(struct brw_context *brw,
 292                          const char *file, int line)
 293 {
 294    int ret;
 295
 296    if (brw->batch.used == 0)
 297       return 0;
 298
 299    if (brw->first_post_swapbuffers_batch == NULL) {
 300       brw->first_post_swapbuffers_batch = brw->batch.bo;
 301       drm_intel_bo_reference(brw->first_post_swapbuffers_batch);
 302    }
 303
 304    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
 305       int bytes_for_commands = 4 * brw->batch.used;
 306       int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
 307       int total_bytes = bytes_for_commands + bytes_for_state;
 308       fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
 309               "%4db (state) = %4db (%0.1f%%)\n", file, line,
 310               bytes_for_commands, bytes_for_state,
 311               total_bytes,
 312               100.0f * total_bytes / BATCH_SZ);
 313    }
 314
 315    brw->batch.reserved_space = 0;
 316
 317    brw_finish_batch(brw);
 318
 319    /* Mark the end of the buffer. */
 320    intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
 321    if (brw->batch.used & 1) {
 322       /* Round batchbuffer usage to 2 DWORDs. */
 323       intel_batchbuffer_emit_dword(brw, MI_NOOP);
 324    }
 325
 326    intel_upload_finish(brw);
 327
 328    /* Check that we didn't just wrap our batchbuffer at a bad time. */
 329    assert(!brw->no_batch_wrap);
 330
 331    ret = do_flush_locked(brw);
 332
 333    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
 334       fprintf(stderr, "waiting for idle\n");
 335       drm_intel_bo_wait_rendering(brw->batch.bo);
 336    }
 337
 338    /* Start a new batch buffer. */
 339    brw_new_batch(brw);
 340
 341    return ret;
 342 }
 343
 344
 345 /*  This is the only way buffers get added to the validate list.
 346  */
 347 bool
 348 intel_batchbuffer_emit_reloc(struct brw_context *brw,
 349                              drm_intel_bo *buffer,
 350                              uint32_t read_domains, uint32_t write_domain,
 351                              uint32_t delta)
 352 {
 353    int ret;
 354
 355    ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
 356                                  buffer, delta,
 357                                  read_domains, write_domain);
 358    assert(ret == 0);
 359    (void)ret;
 360
 361    /* Using the old buffer offset, write in what the right data would be, in
 362     * case the buffer doesn't move and we can short-circuit the relocation
 363     * processing in the kernel
 364     */
 365    intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
 366
 367    return true;
 368 }
 369
 370 bool
 371 intel_batchbuffer_emit_reloc64(struct brw_context *brw,
 372                                drm_intel_bo *buffer,
 373                                uint32_t read_domains, uint32_t write_domain,
 374                                uint32_t delta)
 375 {
 376    int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
 377                                      buffer, delta,
 378                                      read_domains, write_domain);
 379    assert(ret == 0);
 380    (void) ret;
 381
 382    /* Using the old buffer offset, write in what the right data would be, in
 383     * case the buffer doesn't move and we can short-circuit the relocation
 384     * processing in the kernel
 385     */
 386    uint64_t offset = buffer->offset64 + delta;
 387    intel_batchbuffer_emit_dword(brw, offset);
 388    intel_batchbuffer_emit_dword(brw, offset >> 32);
 389
 390    return true;
 391 }
 392
 393
 394 void
 395 intel_batchbuffer_data(struct brw_context *brw,
 396                        const void *data, GLuint bytes, enum brw_gpu_ring ring)
 397 {
 398    assert((bytes & 3) == 0);
 399    intel_batchbuffer_require_space(brw, bytes, ring);
 400    __memcpy(brw->batch.map + brw->batch.used, data, bytes);
 401    brw->batch.used += bytes >> 2;
 402 }
 403
 404 /**
 405  * According to the latest documentation, any PIPE_CONTROL with the
 406  * "Command Streamer Stall" bit set must also have another bit set,
 407  * with five different options:
 408  *
 409  *  - Render Target Cache Flush
 410  *  - Depth Cache Flush
 411  *  - Stall at Pixel Scoreboard
 412  *  - Post-Sync Operation
 413  *  - Depth Stall
 414  *
 415  * I chose "Stall at Pixel Scoreboard" since we've used it effectively
 416  * in the past, but the choice is fairly arbitrary.
 417  */
 418 static void
 419 gen8_add_cs_stall_workaround_bits(uint32_t *flags)
 420 {
 421    uint32_t wa_bits = PIPE_CONTROL_WRITE_FLUSH |
 422                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 423                       PIPE_CONTROL_WRITE_IMMEDIATE |
 424                       PIPE_CONTROL_WRITE_DEPTH_COUNT |
 425                       PIPE_CONTROL_WRITE_TIMESTAMP |
 426                       PIPE_CONTROL_STALL_AT_SCOREBOARD |
 427                       PIPE_CONTROL_DEPTH_STALL;
 428
 429    /* If we're doing a CS stall, and don't already have one of the
 430     * workaround bits set, add "Stall at Pixel Scoreboard."
 431     */
 432    if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
 433       *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
 434 }
 435
 436 /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
 437  *
 438  * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
 439  *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
 440  *
 441  * Note that the kernel does CS stalls between batches, so we only need
 442  * to count them within a batch.
 443  */
 444 static uint32_t
 445 gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
 446 {
 447    if (brw->gen == 7 && !brw->is_haswell) {
 448       if (flags & PIPE_CONTROL_CS_STALL) {
 449          /* If we're doing a CS stall, reset the counter and carry on. */
 450          brw->batch.pipe_controls_since_last_cs_stall = 0;
 451          return 0;
 452       }
 453
 454       /* If this is the fourth pipe control without a CS stall, do one now. */
 455       if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
 456          brw->batch.pipe_controls_since_last_cs_stall = 0;
 457          return PIPE_CONTROL_CS_STALL;
 458       }
 459    }
 460    return 0;
 461 }
 462
 463 /**
 464  * Emit a PIPE_CONTROL with various flushing flags.
 465  *
 466  * The caller is responsible for deciding what flags are appropriate for the
 467  * given generation.
 468  */
 469 void
 470 brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
 471 {
 472    if (brw->gen >= 8) {
 473       gen8_add_cs_stall_workaround_bits(&flags);
 474
 475       BEGIN_BATCH(6);
 476       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
 477       OUT_BATCH(flags);
 478       OUT_BATCH(0);
 479       OUT_BATCH(0);
 480       OUT_BATCH(0);
 481       OUT_BATCH(0);
 482       ADVANCE_BATCH();
 483    } else if (brw->gen >= 6) {
 484       flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
 485
 486       BEGIN_BATCH(5);
 487       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
 488       OUT_BATCH(flags);
 489       OUT_BATCH(0);
 490       OUT_BATCH(0);
 491       OUT_BATCH(0);
 492       ADVANCE_BATCH();
 493    } else {
 494       BEGIN_BATCH(4);
 495       OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
 496       OUT_BATCH(0);
 497       OUT_BATCH(0);
 498       OUT_BATCH(0);
 499       ADVANCE_BATCH();
 500    }
 501 }
 502
 503 /**
 504  * Emit a PIPE_CONTROL that writes to a buffer object.
 505  *
 506  * \p flags should contain one of the following items:
 507  *  - PIPE_CONTROL_WRITE_IMMEDIATE
 508  *  - PIPE_CONTROL_WRITE_TIMESTAMP
 509  *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 510  */
 511 void
 512 brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
 513                             drm_intel_bo *bo, uint32_t offset,
 514                             uint32_t imm_lower, uint32_t imm_upper)
 515 {
 516    if (brw->gen >= 8) {
 517       gen8_add_cs_stall_workaround_bits(&flags);
 518
 519       BEGIN_BATCH(6);
 520       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
 521       OUT_BATCH(flags);
 522       OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 523                   offset);
 524       OUT_BATCH(imm_lower);
 525       OUT_BATCH(imm_upper);
 526       ADVANCE_BATCH();
 527    } else if (brw->gen >= 6) {
 528       flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
 529
 530       /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
 531        * on later platforms.  We always use PPGTT on Gen7+.
 532        */
 533       unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;
 534
 535       BEGIN_BATCH(5);
 536       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
 537       OUT_BATCH(flags);
 538       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 539                 gen6_gtt | offset);
 540       OUT_BATCH(imm_lower);
 541       OUT_BATCH(imm_upper);
 542       ADVANCE_BATCH();
 543    } else {
 544       BEGIN_BATCH(4);
 545       OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
 546       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 547                 PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
 548       OUT_BATCH(imm_lower);
 549       OUT_BATCH(imm_upper);
 550       ADVANCE_BATCH();
 551    }
 552 }
 553
 554 /**
 555  * Restriction [DevSNB, DevIVB]:
 556  *
 557  * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
 558  * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
 559  * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
 560  * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
 561  * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
 562  * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
 563  * unless SW can otherwise guarantee that the pipeline from WM onwards is
 564  * already flushed (e.g., via a preceding MI_FLUSH).
 565  */
 566 void
 567 intel_emit_depth_stall_flushes(struct brw_context *brw)
 568 {
 569    assert(brw->gen >= 6 && brw->gen <= 9);
 570
 571    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
 572    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
 573    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
 574 }
 575
 576 /**
 577  * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
 578  * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 579  *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
 580  *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
 581  *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
 582  *  to be sent before any combination of VS associated 3DSTATE."
 583  */
 584 void
 585 gen7_emit_vs_workaround_flush(struct brw_context *brw)
 586 {
 587    assert(brw->gen == 7);
 588    brw_emit_pipe_control_write(brw,
 589                                PIPE_CONTROL_WRITE_IMMEDIATE
 590                                | PIPE_CONTROL_DEPTH_STALL,
 591                                brw->batch.workaround_bo, 0,
 592                                0, 0);
 593 }
 594
 595
 596 /**
 597  * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
 598  */
 599 void
 600 gen7_emit_cs_stall_flush(struct brw_context *brw)
 601 {
 602    brw_emit_pipe_control_write(brw,
 603                                PIPE_CONTROL_CS_STALL
 604                                | PIPE_CONTROL_WRITE_IMMEDIATE,
 605                                brw->batch.workaround_bo, 0,
 606                                0, 0);
 607 }
 608
 609
 610 /**
 611  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 612  * implementing two workarounds on gen6.  From section 1.4.7.1
 613  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 614  *
 615  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 616  * produced by non-pipelined state commands), software needs to first
 617  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 618  * 0.
 619  *
 620  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 621  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 622  *
 623  * And the workaround for these two requires this workaround first:
 624  *
 625  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 626  * BEFORE the pipe-control with a post-sync op and no write-cache
 627  * flushes.
 628  *
 629  * And this last workaround is tricky because of the requirements on
 630  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 631  * volume 2 part 1:
 632  *
 633  *     "1 of the following must also be set:
 634  *      - Render Target Cache Flush Enable ([12] of DW1)
 635  *      - Depth Cache Flush Enable ([0] of DW1)
 636  *      - Stall at Pixel Scoreboard ([1] of DW1)
 637  *      - Depth Stall ([13] of DW1)
 638  *      - Post-Sync Operation ([13] of DW1)
 639  *      - Notify Enable ([8] of DW1)"
 640  *
 641  * The cache flushes require the workaround flush that triggered this
 642  * one, so we can't use it.  Depth stall would trigger the same.
 643  * Post-sync nonzero is what triggered this second workaround, so we
 644  * can't use that one either.  Notify enable is IRQs, which aren't
 645  * really our business.  That leaves only stall at scoreboard.
 646  */
 647 void
 648 intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
 649 {
 650    if (!brw->batch.need_workaround_flush)
 651       return;
 652
 653    brw_emit_pipe_control_flush(brw,
 654                                PIPE_CONTROL_CS_STALL |
 655                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
 656
 657    brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
 658                                brw->batch.workaround_bo, 0, 0, 0);
 659
 660    brw->batch.need_workaround_flush = false;
 661 }
 662
 663 /* Emit a pipelined flush to either flush render and texture cache for
 664  * reading from a FBO-drawn texture, or flush so that frontbuffer
 665  * render appears on the screen in DRI1.
 666  *
 667  * This is also used for the always_flush_cache driconf debug option.
 668  */
 669 void
 670 intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
 671 {
 672    if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
 673       BEGIN_BATCH_BLT(4);
 674       OUT_BATCH(MI_FLUSH_DW);
 675       OUT_BATCH(0);
 676       OUT_BATCH(0);
 677       OUT_BATCH(0);
 678       ADVANCE_BATCH();
 679    } else {
 680       int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_WRITE_FLUSH;
 681       if (brw->gen >= 6) {
 682          if (brw->gen == 9) {
 683             /* Hardware workaround: SKL
 684              *
 685              * Emit Pipe Control with all bits set to zero before emitting
 686              * a Pipe Control with VF Cache Invalidate set.
 687              */
 688             brw_emit_pipe_control_flush(brw, 0);
 689          }
 690
 691          flags |= PIPE_CONTROL_INSTRUCTION_FLUSH |
 692                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 693                   PIPE_CONTROL_VF_CACHE_INVALIDATE |
 694                   PIPE_CONTROL_TC_FLUSH |
 695                   PIPE_CONTROL_CS_STALL;
 696
 697          if (brw->gen == 6) {
 698             /* Hardware workaround: SNB B-Spec says:
 699              *
 700              * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
 701              * Flush Enable =1, a PIPE_CONTROL with any non-zero
 702              * post-sync-op is required.
 703              */
 704             intel_emit_post_sync_nonzero_flush(brw);
 705          }
 706       }
 707       brw_emit_pipe_control_flush(brw, flags);
 708    }
 709
 710    brw_render_cache_set_clear(brw);
 711 }
 712
 713 void
 714 brw_load_register_mem(struct brw_context *brw,
 715                       uint32_t reg,
 716                       drm_intel_bo *bo,
 717                       uint32_t read_domains, uint32_t write_domain,
 718                       uint32_t offset)
 719 {
 720    /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
 721    assert(brw->gen >= 7);
 722
 723    if (brw->gen >= 8) {
 724       BEGIN_BATCH(4);
 725       OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
 726       OUT_BATCH(reg);
 727       OUT_RELOC64(bo, read_domains, write_domain, offset);
 728       ADVANCE_BATCH();
 729    } else {
 730       BEGIN_BATCH(3);
 731       OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
 732       OUT_BATCH(reg);
 733       OUT_RELOC(bo, read_domains, write_domain, offset);
 734       ADVANCE_BATCH();
 735    }
 736 }