/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "common/gen_decoder.h"

#include "util/hash_table.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                        bool has_llc);

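/* Hash/compare callbacks for the state_batch_sizes table, which maps batch
 * offsets (stored directly in pointer-sized keys) to the size of the state
 * object emitted there.  Only allocated when DEBUG_BATCH is set.
 */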
static bool
uint_key_compare(const void *a, const void *b)
{
   return a == b;
}

static uint32_t
uint_key_hash(const void *key)
{
   return (uintptr_t) key;
}

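/* On non-LLC platforms the batch is built in a malloc'd CPU shadow buffer
 * and uploaded with drm_intel_bo_subdata() at flush time; with LLC we write
 * the mapped BO directly.
 */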
void
intel_batchbuffer_init(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                       bool has_llc)
{
   intel_batchbuffer_reset(batch, bufmgr, has_llc);

   if (!has_llc) {
      batch->cpu_map = malloc(BATCH_SZ);
      batch->map = batch->cpu_map;
      batch->map_next = batch->cpu_map;
   }

   if (INTEL_DEBUG & DEBUG_BATCH) {
      batch->state_batch_sizes =
         _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
   }
}

static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                        bool has_llc)
{
   if (batch->last_bo != NULL) {
      drm_intel_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->bo;

   batch->bo = drm_intel_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
   if (has_llc) {
      drm_intel_bo_map(batch->bo, true);
      batch->map = batch->bo->virtual;
   }
   batch->map_next = batch->map;

   batch->reserved_space = BATCH_RESERVED;
   batch->state_batch_offset = batch->bo->size;
   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   batch->ring = UNKNOWN_RING;

   if (batch->state_batch_sizes)
      _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
}

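/* Allocate a fresh batch BO and reset the render-cache tracking set, which
 * only tracks buffers rendered to within the current batch.
 */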
static void
intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
{
   intel_batchbuffer_reset(&brw->batch, brw->bufmgr, brw->has_llc);
   brw_render_cache_set_clear(brw);
}

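/* intel_batchbuffer_save_state() and intel_batchbuffer_reset_to_saved() let
 * callers snapshot the batch pointer and relocation count before emitting
 * commands, and roll the batch back to that point if the attempt has to be
 * abandoned (for instance, when a draw call needs to be retried in a fresh
 * batch).
 */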
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw->batch.ring = UNKNOWN_RING;
}

void
intel_batchbuffer_free(struct intel_batchbuffer *batch)
{
   free(batch->cpu_map);
   drm_intel_bo_unreference(batch->last_bo);
   drm_intel_bo_unreference(batch->bo);
   if (batch->state_batch_sizes)
      _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
}

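/* Make sure there is room for at least 'sz' bytes of commands on the given
 * ring, flushing the current batch if it is too full or was built for a
 * different ring.
 */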
void
intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                enum brw_gpu_ring ring)
{
   /* If we're switching rings, implicitly flush the batch. */
   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
       brw->gen >= 6) {
      intel_batchbuffer_flush(brw);
   }

#ifdef DEBUG
   assert(sz < BATCH_SZ - BATCH_RESERVED);
#endif
   if (intel_batchbuffer_space(&brw->batch) < sz)
      intel_batchbuffer_flush(brw);

   /* The intel_batchbuffer_flush() calls above might have changed
    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    */
   brw->batch.ring = ring;
}

#ifdef DEBUG
#define CSI "\e["
#define BLUE_HEADER  CSI "0;44m"
#define NORMAL       CSI "0m"

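/* decode_struct() and decode_structs() look up a named state structure in
 * the genxml spec and pretty-print one instance (or an array of instances)
 * found at the given offset in the batch/state buffer.
 */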
static void
decode_struct(struct brw_context *brw, struct gen_spec *spec,
              const char *struct_name, uint32_t *data,
              uint32_t gtt_offset, uint32_t offset, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   fprintf(stderr, "%s\n", struct_name);
   gen_print_group(stderr, group, gtt_offset + offset,
                   &data[offset / 4], 0, color);
}

static void
decode_structs(struct brw_context *brw, struct gen_spec *spec,
               const char *struct_name,
               uint32_t *data, uint32_t gtt_offset, uint32_t offset,
               int struct_size, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   int entries = brw_state_batch_size(brw, offset) / struct_size;
   for (int i = 0; i < entries; i++) {
      fprintf(stderr, "%s %d\n", struct_name, i);
      gen_print_group(stderr, group, gtt_offset + offset,
                      &data[(offset + i * struct_size) / 4], 0, color);
   }
}

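/* Decode and print the batch that was just submitted using the genxml-based
 * decoder.  Only render-ring batches are decoded; BLT batches are skipped.
 */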
static void
do_batch_dump(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);

   if (batch->ring != RENDER_RING)
      return;

   int ret = drm_intel_bo_map(batch->bo, false);
   if (ret != 0) {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));
   }

   uint32_t *data = batch->bo->virtual ? batch->bo->virtual : batch->map;
   uint32_t *end = data + USED_BATCH(*batch);
   uint32_t gtt_offset = batch->bo->virtual ? batch->bo->offset64 : 0;
   unsigned int length;

   bool color = INTEL_DEBUG & DEBUG_COLOR;
   const char *header_color = color ? BLUE_HEADER : "";
   const char *reset_color = color ? NORMAL : "";

   for (uint32_t *p = data; p < end; p += length) {
      struct gen_group *inst = gen_spec_find_instruction(spec, p);
      if (inst == NULL) {
         fprintf(stderr, "unknown instruction %08x\n", p[0]);
         length = (p[0] & 0xff) + 2;
         continue;
      }

      uint64_t offset = gtt_offset + 4 * (p - data);

      fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
              offset, p[0], gen_group_get_name(inst), reset_color);

      gen_print_group(stderr, inst, offset, p, 1, color);

      switch (gen_group_get_opcode(inst) >> 16) {
      case _3DSTATE_PIPELINED_POINTERS:
         /* TODO: Decode Gen4-5 pipelined pointers */
         break;
      case _3DSTATE_BINDING_TABLE_POINTERS_VS:
      case _3DSTATE_BINDING_TABLE_POINTERS_HS:
      case _3DSTATE_BINDING_TABLE_POINTERS_DS:
      case _3DSTATE_BINDING_TABLE_POINTERS_GS:
      case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
         struct gen_group *group =
            gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
         if (!group)
            break;

         uint32_t bt_offset = p[1] & ~0x1fu;
         int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
         uint32_t *bt_pointers = &data[bt_offset / 4];
         for (int i = 0; i < bt_entries; i++) {
            fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
            gen_print_group(stderr, group, gtt_offset + bt_pointers[i],
                            &data[bt_pointers[i] / 4], 0, color);
         }
         break;
      }
      case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
         decode_structs(brw, spec, "SAMPLER_STATE", data,
                        gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS:
         decode_structs(brw, spec, "CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
         decode_structs(brw, spec, "SF_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
         decode_structs(brw, spec, "SF_CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
         break;
      case _3DSTATE_SCISSOR_STATE_POINTERS:
         decode_structs(brw, spec, "SCISSOR_RECT", data,
                        gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
         break;
      case _3DSTATE_BLEND_STATE_POINTERS:
         /* TODO: handle Gen8+ extra dword at the beginning */
         decode_structs(brw, spec, "BLEND_STATE", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         break;
      case _3DSTATE_CC_STATE_POINTERS:
         if (brw->gen >= 7) {
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[1] & ~0x3fu, color);
         } else if (brw->gen == 6) {
            decode_structs(brw, spec, "BLEND_STATE", data,
                           gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
            decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                          gtt_offset, p[2] & ~0x3fu, color);
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[3] & ~0x3fu, color);
         }
         break;
      case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
         decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                       gtt_offset, p[1] & ~0x3fu, color);
         break;
      }

      length = gen_group_get_length(inst, p);
   }

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);
   }
}
#else
static void do_batch_dump(struct brw_context *brw) { }
#endif

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT
       * and therefore assume that the L3 cache is configured according to the
       * hardware defaults.
       */
      if (brw->gen >= 7)
         gen7_restore_default_l3_config(brw);

      if (brw->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          *    "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
          *     3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe control
          * flush here as well. We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point, the
    * round trips for swap/copy and getting new buffers are done, and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *dri_screen = brw->screen->driScrnPriv;
      drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
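/* Hand the finished batch to the kernel: on non-LLC platforms, upload the
 * CPU shadow copy (commands and state) into the BO first, then pick the
 * target ring/flags and submit through the libdrm exec functions.
 */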
static int
do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->screen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            assert(in_fence_fd == -1);
            assert(out_fence_fd == NULL);
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
                                        NULL, 0, 0, flags);
         } else {
            ret = drm_intel_gem_bo_fence_exec(batch->bo, brw->hw_ctx,
                                              4 * USED_BATCH(*batch),
                                              in_fence_fd, out_fence_fd,
                                              flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

/**
 * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
 * of the fd.
 *
 * The out_fence_fd is ignored if NULL.  Otherwise, the caller takes ownership
 * of the returned fd.
 */
int
_intel_batchbuffer_flush_fence(struct brw_context *brw,
                               int in_fence_fd, int *out_fence_fd,
                               const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/* This is the only way buffers get added to the validate list.
 */
uint32_t
intel_batchbuffer_reloc(struct intel_batchbuffer *batch,
                        drm_intel_bo *buffer, uint32_t offset,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(batch->bo, offset,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}

uint64_t
intel_batchbuffer_reloc64(struct intel_batchbuffer *batch,
                          drm_intel_bo *buffer, uint32_t offset,
                          uint32_t read_domains, uint32_t write_domain,
                          uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(batch->bo, offset,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}


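/* Copy a block of pre-built dwords into the batch, reserving space (and
 * selecting the ring) first.  'bytes' must be a multiple of 4.
 */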
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}

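/* Emit 'size' consecutive MI_LOAD_REGISTER_MEM commands, loading registers
 * reg, reg+4, ... from successive dwords at 'offset' in 'bo'.  Gen8+ uses
 * 64-bit relocations, so each command is one dword longer there.
 */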
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}

/*
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      ADVANCE_BATCH();
   }
}

/*
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (brw->gen >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}

/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write a 64-bit register using immediate data.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}

/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}

/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}

/*
 * Write 32-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write 64-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}