/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"

/* Needed for drmCommandNone() and the I915_* execbuffer/throttle
 * definitions used below.
 */
#include <xf86drm.h>
#include <i915_drm.h>
static void
intel_batchbuffer_reset(struct brw_context *brw);

void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (!brw->has_llc) {
      /* Without a shared last-level cache, build the batch in a malloc'd
       * shadow buffer and upload it at flush time instead of writing
       * through a GTT mapping.
       */
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
      brw->batch.map_next = brw->batch.cpu_map;
   }
}
static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }
   brw->batch.map_next = brw->batch.map;

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.needs_sol_reset = false;
   brw->batch.state_base_address_emitted = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw->batch.ring = UNKNOWN_RING;
}
void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
}
void
intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                enum brw_gpu_ring ring)
{
   /* If we're switching rings, implicitly flush the batch. */
   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
       brw->gen >= 6) {
      intel_batchbuffer_flush(brw);
   }

   assert(sz < BATCH_SZ - BATCH_RESERVED);

   if (intel_batchbuffer_space(brw) < sz)
      intel_batchbuffer_flush(brw);

   enum brw_gpu_ring prev_ring = brw->batch.ring;
   /* The intel_batchbuffer_flush() calls above might have changed
    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    */
   brw->batch.ring = ring;

   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
      intel_batchbuffer_emit_render_ring_prelude(brw);
}
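
/* A note on usage: the rest of this file reaches
 * intel_batchbuffer_require_space() indirectly through the
 * BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() macros from intel_batchbuffer.h.
 * As a rough sketch (the exact macro definitions live in the header, not
 * here), emitting a command looks like:
 *
 *    BEGIN_BATCH(3);                              // make room for 3 DWORDs
 *    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));   // command header
 *    OUT_BATCH(reg);                              // register offset
 *    OUT_BATCH(imm);                              // immediate payload
 *    ADVANCE_BATCH();                             // sanity-check DWORD count
 */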
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         USED_BATCH(*batch));
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         USED_BATCH(*batch));
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}
void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}
/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}
/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT
       * and assume that the L3 cache is configured according to the hardware
       * defaults.
       */
      if (brw->gen >= 7)
         gen7_restore_default_l3_config(brw);

      /* We may also need to snapshot and disable OA counters. */
      brw_perf_monitor_finish_batch(brw);

      if (brw->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          *    "SW must program 3DSTATE_CC_STATE_POINTERS command at the end
          *     of every 3D batch buffer followed by a PIPE_CONTROL with RC
          *     flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe
          * control flush here as well. We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}
/* Drop when RS headers get pulled to libdrm */
#ifndef I915_EXEC_RESOURCE_STREAMER
#define I915_EXEC_RESOURCE_STREAMER (1<<15)
#endif
/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER |
            (brw->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0);
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
                                        NULL, 0, 0, flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * USED_BATCH(*batch), flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}
/* Flush the current batchbuffer to the hardware and start a new one. */
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state, total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   if (brw->use_resource_streamer)
      gen7_reset_hw_bt_pool_offsets(brw);

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}
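
/* Note: callers elsewhere in the driver reach the function above through an
 * intel_batchbuffer_flush(brw) wrapper (used earlier in this file by
 * intel_batchbuffer_require_space()); presumably a macro in
 * intel_batchbuffer.h that supplies __FILE__ and __LINE__ so the DEBUG_BATCH
 * report can attribute each flush to its call site.
 */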
/* This is the only way buffers get added to the validate list.
 */
uint32_t
intel_batchbuffer_reloc(struct brw_context *brw,
                        drm_intel_bo *buffer, uint32_t offset,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}
uint64_t
intel_batchbuffer_reloc64(struct brw_context *brw,
                          drm_intel_bo *buffer, uint32_t offset,
                          uint32_t read_domains, uint32_t write_domain,
                          uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}
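
/* The relocation helpers above are normally reached through the OUT_RELOC()
 * and OUT_RELOC64() macros used later in this file.  As a rough sketch (the
 * real definitions live in intel_batchbuffer.h), OUT_RELOC(bo, read, write,
 * delta) emits the presumed address returned by intel_batchbuffer_reloc()
 * into the batch at the current offset, so the kernel only has to patch the
 * entry if the buffer actually moves.
 */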
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}
/**
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      ADVANCE_BATCH();
   }
}
/**
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (brw->gen >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}
/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}
/*
 * Write a 64-bit register using immediate data.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}
/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}
/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}
/**
 * Write 32-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}
/**
 * Write 64-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}