i965: Use %x instead of %u in debug print.
[mesa.git] / src / mesa / drivers / dri / i965 / intel_batchbuffer.c
1 /*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
35
36 #include "util/hash_table.h"
37
38 #include <xf86drm.h>
39 #include <i915_drm.h>
40
41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
42
43 /**
44 * Target sizes of the batch and state buffers. We create the initial
45 * buffers at these sizes, and flush when they're nearly full. If we
46 * underestimate how close we are to the end, and suddenly need more space
47 * in the middle of a draw, we can grow the buffers, and finish the draw.
48 * At that point, we'll be over our target size, so the next operation
49 * should flush. Each time we flush the batch, we recreate both buffers
50  * at the original target size, so they don't grow without bound.
51 */
52 #define BATCH_SZ (20 * 1024)
53 #define STATE_SZ (16 * 1024)
54
55 static void
56 intel_batchbuffer_reset(struct brw_context *brw);
57
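/* Debug helper: print every BO on the execbuf validation list along with its
 * GEM handle, name, address-space/write flags, presumed GTT offset, and size.
 */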
58 UNUSED static void
59 dump_validation_list(struct intel_batchbuffer *batch)
60 {
61 fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
62
63 for (int i = 0; i < batch->exec_count; i++) {
64 uint64_t flags = batch->validation_list[i].flags;
65 assert(batch->validation_list[i].handle ==
66 batch->exec_bos[i]->gem_handle);
67 fprintf(stderr, "[%2d]: %2d %-14s %p %s%-7s @ 0x%016llx%s (%"PRIu64"B)\n",
68 i,
69 batch->validation_list[i].handle,
70 batch->exec_bos[i]->name,
71 batch->exec_bos[i],
72 (flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) ? "(48b" : "(32b",
73 (flags & EXEC_OBJECT_WRITE) ? " write)" : ")",
74 batch->validation_list[i].offset,
75 (flags & EXEC_OBJECT_PINNED) ? " (pinned)" : "",
76 batch->exec_bos[i]->size);
77 }
78 }
79
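/* Identity hash/compare callbacks for state_batch_sizes, whose keys are
 * statebuffer offsets stored directly as pointer-sized integers.
 */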
80 static bool
81 uint_key_compare(const void *a, const void *b)
82 {
83 return a == b;
84 }
85
86 static uint32_t
87 uint_key_hash(const void *key)
88 {
89 return (uintptr_t) key;
90 }
91
92 static void
93 init_reloc_list(struct brw_reloc_list *rlist, int count)
94 {
95 rlist->reloc_count = 0;
96 rlist->reloc_array_size = count;
97 rlist->relocs = malloc(rlist->reloc_array_size *
98 sizeof(struct drm_i915_gem_relocation_entry));
99 }
100
101 void
102 intel_batchbuffer_init(struct brw_context *brw)
103 {
104 struct intel_screen *screen = brw->screen;
105 struct intel_batchbuffer *batch = &brw->batch;
106 const struct gen_device_info *devinfo = &screen->devinfo;
107
108 batch->use_shadow_copy = !devinfo->has_llc;
109
110 if (batch->use_shadow_copy) {
111 batch->batch.map = malloc(BATCH_SZ);
112 batch->map_next = batch->batch.map;
113 batch->state.map = malloc(STATE_SZ);
114 }
115
116 init_reloc_list(&batch->batch_relocs, 250);
117 init_reloc_list(&batch->state_relocs, 250);
118
119 batch->exec_count = 0;
120 batch->exec_array_size = 100;
121 batch->exec_bos =
122 malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
123 batch->validation_list =
124 malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
125
126 if (INTEL_DEBUG & DEBUG_BATCH) {
127 batch->state_batch_sizes =
128 _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
129 }
130
131 batch->use_batch_first =
132 screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
133
134    /* PIPE_CONTROL needs a workaround (its write target must be in the GGTT), but only on Gen6 */
135 batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
136 if (devinfo->gen == 6)
137 batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
138
139 intel_batchbuffer_reset(brw);
140 }
141
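/* Read a value exactly once through a volatile cast so the compiler cannot
 * re-load it. bo->index is only a hint (the BO may be shared with other
 * batches), and callers verify it against exec_bos[] before trusting it.
 */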
142 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
143
144 static unsigned
145 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
146 {
147 unsigned index = READ_ONCE(bo->index);
148
149 if (index < batch->exec_count && batch->exec_bos[index] == bo)
150 return index;
151
152 /* May have been shared between multiple active batches */
153 for (index = 0; index < batch->exec_count; index++) {
154 if (batch->exec_bos[index] == bo)
155 return index;
156 }
157
158 brw_bo_reference(bo);
159
160 if (batch->exec_count == batch->exec_array_size) {
161 batch->exec_array_size *= 2;
162 batch->exec_bos =
163 realloc(batch->exec_bos,
164 batch->exec_array_size * sizeof(batch->exec_bos[0]));
165 batch->validation_list =
166 realloc(batch->validation_list,
167 batch->exec_array_size * sizeof(batch->validation_list[0]));
168 }
169
170 batch->validation_list[batch->exec_count] =
171 (struct drm_i915_gem_exec_object2) {
172 .handle = bo->gem_handle,
173 .offset = bo->gtt_offset,
174 .flags = bo->kflags,
175 };
176
177 bo->index = batch->exec_count;
178 batch->exec_bos[batch->exec_count] = bo;
179 batch->aperture_space += bo->size;
180
181 return batch->exec_count++;
182 }
183
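/* Allocate a fresh batch or state buffer at its original target size and
 * reset the partial-BO bookkeeping left over from any earlier grow. The BO
 * is only CPU-mapped directly when we aren't using a malloc'd shadow copy.
 */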
184 static void
185 recreate_growing_buffer(struct brw_context *brw,
186 struct brw_growing_bo *grow,
187 const char *name, unsigned size)
188 {
189 struct intel_screen *screen = brw->screen;
190 struct intel_batchbuffer *batch = &brw->batch;
191 struct brw_bufmgr *bufmgr = screen->bufmgr;
192
193 grow->bo = brw_bo_alloc(bufmgr, name, size);
194 grow->bo->kflags = can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
195 grow->partial_bo = NULL;
196 grow->partial_bo_map = NULL;
197 grow->partial_bytes = 0;
198
199 if (!batch->use_shadow_copy)
200 grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
201 }
202
203 static void
204 intel_batchbuffer_reset(struct brw_context *brw)
205 {
206 struct intel_batchbuffer *batch = &brw->batch;
207
208 if (batch->last_bo != NULL) {
209 brw_bo_unreference(batch->last_bo);
210 batch->last_bo = NULL;
211 }
212 batch->last_bo = batch->batch.bo;
213
214 recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ);
215 batch->map_next = batch->batch.map;
216
217 recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ);
218
219 /* Avoid making 0 a valid state offset - otherwise the decoder will try
220     * to decode data when we use offset 0 as a null pointer.
221 */
222 batch->state_used = 1;
223
224 add_exec_bo(batch, batch->batch.bo);
225 assert(batch->batch.bo->index == 0);
226
227 batch->needs_sol_reset = false;
228 batch->state_base_address_emitted = false;
229
230 /* We don't know what ring the new batch will be sent to until we see the
231 * first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
232 */
233 batch->ring = UNKNOWN_RING;
234
235 if (batch->state_batch_sizes)
236 _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
237 }
238
239 static void
240 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
241 {
242 intel_batchbuffer_reset(brw);
243 brw_cache_sets_clear(brw);
244 }
245
246 void
247 intel_batchbuffer_save_state(struct brw_context *brw)
248 {
249 brw->batch.saved.map_next = brw->batch.map_next;
250 brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
251 brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
252 brw->batch.saved.exec_count = brw->batch.exec_count;
253 }
254
255 void
256 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
257 {
258 for (int i = brw->batch.saved.exec_count;
259 i < brw->batch.exec_count; i++) {
260 brw_bo_unreference(brw->batch.exec_bos[i]);
261 }
262 brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
263 brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
264 brw->batch.exec_count = brw->batch.saved.exec_count;
265
266 brw->batch.map_next = brw->batch.saved.map_next;
267 if (USED_BATCH(brw->batch) == 0)
268 brw->batch.ring = UNKNOWN_RING;
269 }
270
271 void
272 intel_batchbuffer_free(struct intel_batchbuffer *batch)
273 {
274 if (batch->use_shadow_copy) {
275 free(batch->batch.map);
276 free(batch->state.map);
277 }
278
279 for (int i = 0; i < batch->exec_count; i++) {
280 brw_bo_unreference(batch->exec_bos[i]);
281 }
282 free(batch->batch_relocs.relocs);
283 free(batch->state_relocs.relocs);
284 free(batch->exec_bos);
285 free(batch->validation_list);
286
287 brw_bo_unreference(batch->last_bo);
288 brw_bo_unreference(batch->batch.bo);
289 brw_bo_unreference(batch->state.bo);
290 if (batch->state_batch_sizes)
291 _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
292 }
293
294 /**
295 * Finish copying the old batch/state buffer's contents to the new one
296 * after we tried to "grow" the buffer in an earlier operation.
297 */
298 static void
299 finish_growing_bos(struct brw_growing_bo *grow)
300 {
301 struct brw_bo *old_bo = grow->partial_bo;
302 if (!old_bo)
303 return;
304
305 memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
306
307 grow->partial_bo = NULL;
308 grow->partial_bo_map = NULL;
309 grow->partial_bytes = 0;
310
311 brw_bo_unreference(old_bo);
312 }
313
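/* Retarget relocation entries from an old GEM handle to a new one. This is
 * only needed when target_handle holds a real handle; with
 * I915_EXEC_HANDLE_LUT it is an index into the validation list, which is
 * unchanged by growing a buffer.
 */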
314 static void
315 replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
316 uint32_t old_handle, uint32_t new_handle)
317 {
318 for (int i = 0; i < rlist->reloc_count; i++) {
319 if (rlist->relocs[i].target_handle == old_handle)
320 rlist->relocs[i].target_handle = new_handle;
321 }
322 }
323
324 /**
325 * Grow either the batch or state buffer to a new larger size.
326 *
327 * We can't actually grow buffers, so we allocate a new one, copy over
328 * the existing contents, and update our lists to refer to the new one.
329 *
330 * Note that this is only temporary - each new batch recreates the buffers
331 * at their original target size (BATCH_SZ or STATE_SZ).
332 */
333 static void
334 grow_buffer(struct brw_context *brw,
335 struct brw_growing_bo *grow,
336 unsigned existing_bytes,
337 unsigned new_size)
338 {
339 struct intel_batchbuffer *batch = &brw->batch;
340 struct brw_bufmgr *bufmgr = brw->bufmgr;
341 struct brw_bo *bo = grow->bo;
342
343 perf_debug("Growing %s - ran out of space\n", bo->name);
344
345 if (grow->partial_bo) {
346 /* We've already grown once, and now we need to do it again.
347 * Finish our last grow operation so we can start a new one.
348 * This should basically never happen.
349 */
350 perf_debug("Had to grow multiple times");
351 finish_growing_bos(grow);
352 }
353
354 struct brw_bo *new_bo = brw_bo_alloc(bufmgr, bo->name, new_size);
355
356 /* Copy existing data to the new larger buffer */
357 grow->partial_bo_map = grow->map;
358
359 if (batch->use_shadow_copy) {
360 /* We can't safely use realloc, as it may move the existing buffer,
361 * breaking existing pointers the caller may still be using. Just
362 * malloc a new copy and memcpy it like the normal BO path.
363 */
364 grow->map = malloc(new_size);
365 } else {
366 grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
367 }
368
369 /* Try to put the new BO at the same GTT offset as the old BO (which
370 * we're throwing away, so it doesn't need to be there).
371 *
372 * This guarantees that our relocations continue to work: values we've
373 * already written into the buffer, values we're going to write into the
374 * buffer, and the validation/relocation lists all will match.
375 *
376 * Also preserve kflags for EXEC_OBJECT_CAPTURE.
377 */
378 new_bo->gtt_offset = bo->gtt_offset;
379 new_bo->index = bo->index;
380 new_bo->kflags = bo->kflags;
381
382 /* Batch/state buffers are per-context, and if we've run out of space,
383 * we must have actually used them before, so...they will be in the list.
384 */
385 assert(bo->index < batch->exec_count);
386 assert(batch->exec_bos[bo->index] == bo);
387
388 /* Update the validation list to use the new BO. */
389 batch->validation_list[bo->index].handle = new_bo->gem_handle;
390
391 if (!batch->use_batch_first) {
392 /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
393 * update the relocation list entries to point at the new BO as well.
394 * (With newer kernels, the "handle" is an offset into the validation
395 * list, which remains unchanged, so we can skip this.)
396 */
397 replace_bo_in_reloc_list(&batch->batch_relocs,
398 bo->gem_handle, new_bo->gem_handle);
399 replace_bo_in_reloc_list(&batch->state_relocs,
400 bo->gem_handle, new_bo->gem_handle);
401 }
402
403 /* Exchange the two BOs...without breaking pointers to the old BO.
404 *
405 * Consider this scenario:
406 *
407 * 1. Somebody calls brw_state_batch() to get a region of memory, and
408    *    then creates a brw_address pointing to brw->batch.state.bo.
409 * 2. They then call brw_state_batch() a second time, which happens to
410 * grow and replace the state buffer. They then try to emit a
411 * relocation to their first section of memory.
412 *
413 * If we replace the brw->batch.state.bo pointer at step 2, we would
414 * break the address created in step 1. They'd have a pointer to the
415 * old destroyed BO. Emitting a relocation would add this dead BO to
416 * the validation list...causing /both/ statebuffers to be in the list,
417 * and all kinds of disasters.
418 *
419 * This is not a contrived case - BLORP vertex data upload hits this.
420 *
421 * There are worse scenarios too. Fences for GL sync objects reference
422 * brw->batch.batch.bo. If we replaced the batch pointer when growing,
423 * we'd need to chase down every fence and update it to point to the
424 * new BO. Otherwise, it would refer to a "batch" that never actually
425 * gets submitted, and would fail to trigger.
426 *
427    * To work around both of these issues, we transmute the buffers in
428 * place, making the existing struct brw_bo represent the new buffer,
429 * and "new_bo" represent the old BO. This is highly unusual, but it
430 * seems like a necessary evil.
431 *
432 * We also defer the memcpy of the existing batch's contents. Callers
433 * may make multiple brw_state_batch calls, and retain pointers to the
434    * old BO's map. We'll perform the memcpy in finish_growing_bos() when
435 * we finally submit the batch, at which point we've finished uploading
436 * state, and nobody should have any old references anymore.
437 *
438 * To do that, we keep a reference to the old BO in grow->partial_bo,
439 * and store the number of bytes to copy in grow->partial_bytes. We
440 * can monkey with the refcounts directly without atomics because these
441 * are per-context BOs and they can only be touched by this thread.
442 */
443 assert(new_bo->refcount == 1);
444 new_bo->refcount = bo->refcount;
445 bo->refcount = 1;
446
447 struct brw_bo tmp;
448 memcpy(&tmp, bo, sizeof(struct brw_bo));
449 memcpy(bo, new_bo, sizeof(struct brw_bo));
450 memcpy(new_bo, &tmp, sizeof(struct brw_bo));
451
452 grow->partial_bo = new_bo; /* the one reference of the OLD bo */
453 grow->partial_bytes = existing_bytes;
454 }
455
456 void
457 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
458 enum brw_gpu_ring ring)
459 {
460 const struct gen_device_info *devinfo = &brw->screen->devinfo;
461 struct intel_batchbuffer *batch = &brw->batch;
462
463 /* If we're switching rings, implicitly flush the batch. */
464 if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
465 devinfo->gen >= 6) {
466 intel_batchbuffer_flush(brw);
467 }
468
469 const unsigned batch_used = USED_BATCH(*batch) * 4;
470 if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
471 intel_batchbuffer_flush(brw);
472 } else if (batch_used + sz >= batch->batch.bo->size) {
473 const unsigned new_size =
474 MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
475 MAX_BATCH_SIZE);
476 grow_buffer(brw, &batch->batch, batch_used, new_size);
477 batch->map_next = (void *) batch->batch.map + batch_used;
478 assert(batch_used + sz < batch->batch.bo->size);
479 }
480
481 /* The intel_batchbuffer_flush() calls above might have changed
482 * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
483 */
484 brw->batch.ring = ring;
485 }
486
487 #ifdef DEBUG
488 #define CSI "\e["
489 #define BLUE_HEADER CSI "0;44m"
490 #define NORMAL CSI "0m"
491
492
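/* Look up a named state structure in the genxml spec and decode a single
 * instance of it from the statebuffer map at the given offset (silently does
 * nothing if the spec doesn't describe that structure).
 */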
493 static void
494 decode_struct(struct brw_context *brw, struct gen_spec *spec,
495 const char *struct_name, uint32_t *data,
496 uint32_t gtt_offset, uint32_t offset, bool color)
497 {
498 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
499 if (!group)
500 return;
501
502 fprintf(stderr, "%s\n", struct_name);
503 gen_print_group(stderr, group, gtt_offset + offset,
504 &data[offset / 4], 0, color);
505 }
506
507 static void
508 decode_structs(struct brw_context *brw, struct gen_spec *spec,
509 const char *struct_name,
510 uint32_t *data, uint32_t gtt_offset, uint32_t offset,
511 int struct_size, bool color)
512 {
513 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
514 if (!group)
515 return;
516
517 int entries = brw_state_batch_size(brw, offset) / struct_size;
518 for (int i = 0; i < entries; i++) {
519 fprintf(stderr, "%s %d\n", struct_name, i);
520 gen_print_group(stderr, group, gtt_offset + offset,
521 &data[(offset + i * struct_size) / 4], 0, color);
522 }
523 }
524
525 static void
526 do_batch_dump(struct brw_context *brw)
527 {
528 const struct gen_device_info *devinfo = &brw->screen->devinfo;
529 struct intel_batchbuffer *batch = &brw->batch;
530 struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
531
532 if (batch->ring != RENDER_RING)
533 return;
534
535 uint32_t *batch_data = brw_bo_map(brw, batch->batch.bo, MAP_READ);
536 uint32_t *state = brw_bo_map(brw, batch->state.bo, MAP_READ);
537 if (batch_data == NULL || state == NULL) {
538 fprintf(stderr, "WARNING: failed to map batchbuffer/statebuffer\n");
539 return;
540 }
541
542 uint32_t *end = batch_data + USED_BATCH(*batch);
543 uint32_t batch_gtt_offset = batch->batch.bo->gtt_offset;
544 uint32_t state_gtt_offset = batch->state.bo->gtt_offset;
545 int length;
546
547 bool color = INTEL_DEBUG & DEBUG_COLOR;
548 const char *header_color = color ? BLUE_HEADER : "";
549 const char *reset_color = color ? NORMAL : "";
550
551 for (uint32_t *p = batch_data; p < end; p += length) {
552 struct gen_group *inst = gen_spec_find_instruction(spec, p);
553 length = gen_group_get_length(inst, p);
554 assert(inst == NULL || length > 0);
555 length = MAX2(1, length);
556 if (inst == NULL) {
557 fprintf(stderr, "unknown instruction %08x\n", p[0]);
558 continue;
559 }
560
561 uint64_t offset = batch_gtt_offset + 4 * (p - batch_data);
562
563 fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
564 offset, p[0], gen_group_get_name(inst), reset_color);
565
566 gen_print_group(stderr, inst, offset, p, 0, color);
567
568 switch (gen_group_get_opcode(inst) >> 16) {
569 case _3DSTATE_PIPELINED_POINTERS:
570 /* Note: these Gen4-5 pointers are full relocations rather than
571 * offsets from the start of the statebuffer. So we need to subtract
572 * gtt_offset (the start of the statebuffer) to obtain an offset we
573 * can add to the map and get at the data.
574 */
575 decode_struct(brw, spec, "VS_STATE", state, state_gtt_offset,
576 (p[1] & ~0x1fu) - state_gtt_offset, color);
577 if (p[2] & 1) {
578 decode_struct(brw, spec, "GS_STATE", state, state_gtt_offset,
579 (p[2] & ~0x1fu) - state_gtt_offset, color);
580 }
581 if (p[3] & 1) {
582 decode_struct(brw, spec, "CLIP_STATE", state, state_gtt_offset,
583 (p[3] & ~0x1fu) - state_gtt_offset, color);
584 }
585 decode_struct(brw, spec, "SF_STATE", state, state_gtt_offset,
586 (p[4] & ~0x1fu) - state_gtt_offset, color);
587 decode_struct(brw, spec, "WM_STATE", state, state_gtt_offset,
588 (p[5] & ~0x1fu) - state_gtt_offset, color);
589 decode_struct(brw, spec, "COLOR_CALC_STATE", state, state_gtt_offset,
590 (p[6] & ~0x3fu) - state_gtt_offset, color);
591 break;
592 case _3DSTATE_BINDING_TABLE_POINTERS_VS:
593 case _3DSTATE_BINDING_TABLE_POINTERS_HS:
594 case _3DSTATE_BINDING_TABLE_POINTERS_DS:
595 case _3DSTATE_BINDING_TABLE_POINTERS_GS:
596 case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
597 struct gen_group *group =
598 gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
599 if (!group)
600 break;
601
602 uint32_t bt_offset = p[1] & ~0x1fu;
603 int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
604 uint32_t *bt_pointers = &state[bt_offset / 4];
605 for (int i = 0; i < bt_entries; i++) {
606 fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
607 gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
608 &state[bt_pointers[i] / 4], 0, color);
609 }
610 break;
611 }
612 case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
613 case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
614 case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
615 case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
616 case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
617 decode_structs(brw, spec, "SAMPLER_STATE", state,
618 state_gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
619 break;
620 case _3DSTATE_VIEWPORT_STATE_POINTERS:
621 decode_structs(brw, spec, "CLIP_VIEWPORT", state,
622 state_gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
623 decode_structs(brw, spec, "SF_VIEWPORT", state,
624 state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
625 decode_structs(brw, spec, "CC_VIEWPORT", state,
626 state_gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
627 break;
628 case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
629 decode_structs(brw, spec, "CC_VIEWPORT", state,
630 state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
631 break;
632 case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
633 decode_structs(brw, spec, "SF_CLIP_VIEWPORT", state,
634 state_gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
635 break;
636 case _3DSTATE_SCISSOR_STATE_POINTERS:
637 decode_structs(brw, spec, "SCISSOR_RECT", state,
638 state_gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
639 break;
640 case _3DSTATE_BLEND_STATE_POINTERS:
641 /* TODO: handle Gen8+ extra dword at the beginning */
642 decode_structs(brw, spec, "BLEND_STATE", state,
643 state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
644 break;
645 case _3DSTATE_CC_STATE_POINTERS:
646 if (devinfo->gen >= 7) {
647 decode_struct(brw, spec, "COLOR_CALC_STATE", state,
648 state_gtt_offset, p[1] & ~0x3fu, color);
649 } else if (devinfo->gen == 6) {
650 decode_structs(brw, spec, "BLEND_STATE", state,
651 state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
652 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
653 state_gtt_offset, p[2] & ~0x3fu, color);
654 decode_struct(brw, spec, "COLOR_CALC_STATE", state,
655 state_gtt_offset, p[3] & ~0x3fu, color);
656 }
657 break;
658 case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
659 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
660 state_gtt_offset, p[1] & ~0x3fu, color);
661 break;
662 case MEDIA_INTERFACE_DESCRIPTOR_LOAD: {
663 struct gen_group *group =
664 gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
665 if (!group)
666 break;
667
668 uint32_t idd_offset = p[3] & ~0x1fu;
669 decode_struct(brw, spec, "INTERFACE_DESCRIPTOR_DATA", state,
670 state_gtt_offset, idd_offset, color);
671
672 uint32_t ss_offset = state[idd_offset / 4 + 3] & ~0x1fu;
673 decode_structs(brw, spec, "SAMPLER_STATE", state,
674 state_gtt_offset, ss_offset, 4 * 4, color);
675
676 uint32_t bt_offset = state[idd_offset / 4 + 4] & ~0x1fu;
677 int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
678 uint32_t *bt_pointers = &state[bt_offset / 4];
679 for (int i = 0; i < bt_entries; i++) {
680 fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
681 gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
682 &state[bt_pointers[i] / 4], 0, color);
683 }
684 break;
685 }
686 }
687 }
688
689 brw_bo_unmap(batch->batch.bo);
690 brw_bo_unmap(batch->state.bo);
691 }
692 #else
693 static void do_batch_dump(struct brw_context *brw) { }
694 #endif
695
696 /**
697 * Called when starting a new batch buffer.
698 */
699 static void
700 brw_new_batch(struct brw_context *brw)
701 {
702 /* Unreference any BOs held by the previous batch, and reset counts. */
703 for (int i = 0; i < brw->batch.exec_count; i++) {
704 brw_bo_unreference(brw->batch.exec_bos[i]);
705 brw->batch.exec_bos[i] = NULL;
706 }
707 brw->batch.batch_relocs.reloc_count = 0;
708 brw->batch.state_relocs.reloc_count = 0;
709 brw->batch.exec_count = 0;
710 brw->batch.aperture_space = 0;
711
712 brw_bo_unreference(brw->batch.state.bo);
713
714 /* Create a new batchbuffer and reset the associated state: */
715 intel_batchbuffer_reset_and_clear_render_cache(brw);
716
717 /* If the kernel supports hardware contexts, then most hardware state is
718 * preserved between batches; we only need to re-emit state that is required
719 * to be in every batch. Otherwise we need to re-emit all the state that
720 * would otherwise be stored in the context (which for all intents and
721 * purposes means everything).
722 */
723 if (brw->hw_ctx == 0) {
724 brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
725 brw_upload_invariant_state(brw);
726 }
727
728 brw->ctx.NewDriverState |= BRW_NEW_BATCH;
729
730 brw->ib.index_size = -1;
731
732 /* We need to periodically reap the shader time results, because rollover
733 * happens every few seconds. We also want to see results every once in a
734 * while, because many programs won't cleanly destroy our context, so the
735 * end-of-run printout may not happen.
736 */
737 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
738 brw_collect_and_report_shader_time(brw);
739 }
740
741 /**
742 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
743 * sending it off.
744 *
745 * This function can emit state (say, to preserve registers that aren't saved
746 * between batches).
747 */
748 static void
749 brw_finish_batch(struct brw_context *brw)
750 {
751 const struct gen_device_info *devinfo = &brw->screen->devinfo;
752
753 brw->batch.no_wrap = true;
754
755 /* Capture the closing pipeline statistics register values necessary to
756 * support query objects (in the non-hardware context world).
757 */
758 brw_emit_query_end(brw);
759
760 if (brw->batch.ring == RENDER_RING) {
761       /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT and
762        * assume that the L3 cache is configured according to the hardware
763        * defaults. On kernel 4.16+, we no longer need to do this.
764 */
765 if (devinfo->gen >= 7 &&
766 !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
767 gen7_restore_default_l3_config(brw);
768
769 if (devinfo->is_haswell) {
770 /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
771 * 3DSTATE_CC_STATE_POINTERS > "Note":
772 *
773 * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
774 * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
775 *
776 * From the example in the docs, it seems to expect a regular pipe control
777 * flush here as well. We may have done it already, but meh.
778 *
779 * See also WaAvoidRCZCounterRollover.
780 */
781 brw_emit_mi_flush(brw);
782 BEGIN_BATCH(2);
783 OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
784 OUT_BATCH(brw->cc.state_offset | 1);
785 ADVANCE_BATCH();
786 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
787 PIPE_CONTROL_CS_STALL);
788 }
789
790 /* Do not restore push constant packets during context restore. */
791 if (devinfo->gen >= 7)
792 gen10_emit_isp_disable(brw);
793 }
794
795 /* Emit MI_BATCH_BUFFER_END to finish our batch. Note that execbuf2
796 * requires our batch size to be QWord aligned, so we pad it out if
797 * necessary by emitting an extra MI_NOOP after the end.
798 */
799 intel_batchbuffer_require_space(brw, 8, brw->batch.ring);
800 *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
801 if (USED_BATCH(brw->batch) & 1) {
802 *brw->batch.map_next++ = MI_NOOP;
803 }
804
805 brw->batch.no_wrap = false;
806 }
807
808 static void
809 throttle(struct brw_context *brw)
810 {
811 /* Wait for the swapbuffers before the one we just emitted, so we
812 * don't get too many swaps outstanding for apps that are GPU-heavy
813 * but not CPU-heavy.
814 *
815 * We're using intelDRI2Flush (called from the loader before
816 * swapbuffer) and glFlush (for front buffer rendering) as the
817 * indicator that a frame is done and then throttle when we get
818    * here as we prepare to render the next frame. At this point the
819    * round trips for swap/copy and getting new buffers are done, so
820    * we'll spend less time waiting on the GPU.
821 *
822 * Unfortunately, we don't have a handle to the batch containing
823 * the swap, and getting our hands on that doesn't seem worth it,
824 * so we just use the first batch we emitted after the last swap.
825 */
826 if (brw->need_swap_throttle && brw->throttle_batch[0]) {
827 if (brw->throttle_batch[1]) {
828 if (!brw->disable_throttling) {
829 brw_bo_wait_rendering(brw->throttle_batch[1]);
830 }
831 brw_bo_unreference(brw->throttle_batch[1]);
832 }
833 brw->throttle_batch[1] = brw->throttle_batch[0];
834 brw->throttle_batch[0] = NULL;
835 brw->need_swap_throttle = false;
836 /* Throttling here is more precise than the throttle ioctl, so skip it */
837 brw->need_flush_throttle = false;
838 }
839
840 if (brw->need_flush_throttle) {
841 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
842 drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
843 brw->need_flush_throttle = false;
844 }
845 }
846
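/* Thin wrapper around DRM_IOCTL_I915_GEM_EXECBUFFER2 (or the _WR variant when
 * an output fence is requested): submits the validation list, threads the
 * optional in/out fence FDs through, and writes back any GTT offsets the
 * kernel chose. Returns 0 on success or -errno on failure.
 */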
847 static int
848 execbuffer(int fd,
849 struct intel_batchbuffer *batch,
850 uint32_t ctx_id,
851 int used,
852 int in_fence,
853 int *out_fence,
854 int flags)
855 {
856 struct drm_i915_gem_execbuffer2 execbuf = {
857 .buffers_ptr = (uintptr_t) batch->validation_list,
858 .buffer_count = batch->exec_count,
859 .batch_start_offset = 0,
860 .batch_len = used,
861 .flags = flags,
862 .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
863 };
864
865 unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
866
867 if (in_fence != -1) {
868 execbuf.rsvd2 = in_fence;
869 execbuf.flags |= I915_EXEC_FENCE_IN;
870 }
871
872 if (out_fence != NULL) {
873 cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
874 *out_fence = -1;
875 execbuf.flags |= I915_EXEC_FENCE_OUT;
876 }
877
878 int ret = drmIoctl(fd, cmd, &execbuf);
879 if (ret != 0)
880 ret = -errno;
881
882 for (int i = 0; i < batch->exec_count; i++) {
883 struct brw_bo *bo = batch->exec_bos[i];
884
885 bo->idle = false;
886 bo->index = -1;
887
888 /* Update brw_bo::gtt_offset */
889 if (batch->validation_list[i].offset != bo->gtt_offset) {
890 DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
891 bo->gem_handle, bo->gtt_offset,
892 batch->validation_list[i].offset);
893 bo->gtt_offset = batch->validation_list[i].offset;
894 }
895 }
896
897 if (ret == 0 && out_fence != NULL)
898 *out_fence = execbuf.rsvd2 >> 32;
899
900 return ret;
901 }
902
903 static int
904 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
905 {
906 const struct gen_device_info *devinfo = &brw->screen->devinfo;
907 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
908 struct intel_batchbuffer *batch = &brw->batch;
909 int ret = 0;
910
911 if (batch->use_shadow_copy) {
912 void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
913 memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));
914
915 bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
916 memcpy(bo_map, batch->state.map, batch->state_used);
917 }
918
919 brw_bo_unmap(batch->batch.bo);
920 brw_bo_unmap(batch->state.bo);
921
922 if (!brw->screen->no_hw) {
923       /* The requirements for using I915_EXEC_NO_RELOC are:
924 *
925 * The addresses written in the objects must match the corresponding
926 * reloc.gtt_offset which in turn must match the corresponding
927 * execobject.offset.
928 *
929 * Any render targets written to in the batch must be flagged with
930 * EXEC_OBJECT_WRITE.
931 *
932 * To avoid stalling, execobject.offset should match the current
933 * address of that object within the active context.
934 */
935 int flags = I915_EXEC_NO_RELOC;
936
937 if (devinfo->gen >= 6 && batch->ring == BLT_RING) {
938 flags |= I915_EXEC_BLT;
939 } else {
940 flags |= I915_EXEC_RENDER;
941 }
942 if (batch->needs_sol_reset)
943 flags |= I915_EXEC_GEN7_SOL_RESET;
944
945 uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
946
947 /* Set statebuffer relocations */
948 const unsigned state_index = batch->state.bo->index;
949 if (state_index < batch->exec_count &&
950 batch->exec_bos[state_index] == batch->state.bo) {
951 struct drm_i915_gem_exec_object2 *entry =
952 &batch->validation_list[state_index];
953 assert(entry->handle == batch->state.bo->gem_handle);
954 entry->relocation_count = batch->state_relocs.reloc_count;
955 entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
956 }
957
958 /* Set batchbuffer relocations */
959 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
960 assert(entry->handle == batch->batch.bo->gem_handle);
961 entry->relocation_count = batch->batch_relocs.reloc_count;
962 entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;
963
964 if (batch->use_batch_first) {
965 flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
966 } else {
967 /* Move the batch to the end of the validation list */
968 struct drm_i915_gem_exec_object2 tmp;
969 const unsigned index = batch->exec_count - 1;
970
971 tmp = *entry;
972 *entry = batch->validation_list[index];
973 batch->validation_list[index] = tmp;
974 }
975
976 ret = execbuffer(dri_screen->fd, batch, hw_ctx,
977 4 * USED_BATCH(*batch),
978 in_fence_fd, out_fence_fd, flags);
979
980 throttle(brw);
981 }
982
983 if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
984 do_batch_dump(brw);
985
986 if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
987 brw_check_for_reset(brw);
988
989 if (ret != 0) {
990 fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
991 strerror(-ret));
992 exit(1);
993 }
994
995 return ret;
996 }
997
998 /**
999 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
1000 * of the fd.
1001 *
1002 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
1003 * of the returned fd.
1004 */
1005 int
1006 _intel_batchbuffer_flush_fence(struct brw_context *brw,
1007 int in_fence_fd, int *out_fence_fd,
1008 const char *file, int line)
1009 {
1010 int ret;
1011
1012 if (USED_BATCH(brw->batch) == 0)
1013 return 0;
1014
1015 /* Check that we didn't just wrap our batchbuffer at a bad time. */
1016 assert(!brw->batch.no_wrap);
1017
1018 brw_finish_batch(brw);
1019 brw_upload_finish(&brw->upload);
1020
1021 finish_growing_bos(&brw->batch.batch);
1022 finish_growing_bos(&brw->batch.state);
1023
1024 if (brw->throttle_batch[0] == NULL) {
1025 brw->throttle_batch[0] = brw->batch.batch.bo;
1026 brw_bo_reference(brw->throttle_batch[0]);
1027 }
1028
1029 if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
1030 int bytes_for_commands = 4 * USED_BATCH(brw->batch);
1031 int bytes_for_state = brw->batch.state_used;
1032 fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
1033 " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
1034 " %4d batch relocs, %4d state relocs\n", file, line,
1035 bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
1036 bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
1037 brw->batch.exec_count,
1038 (float) brw->batch.aperture_space / (1024 * 1024),
1039 brw->batch.batch_relocs.reloc_count,
1040 brw->batch.state_relocs.reloc_count);
1041 }
1042
1043 ret = submit_batch(brw, in_fence_fd, out_fence_fd);
1044
1045 if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
1046 fprintf(stderr, "waiting for idle\n");
1047 brw_bo_wait_rendering(brw->batch.batch.bo);
1048 }
1049
1050 /* Start a new batch buffer. */
1051 brw_new_batch(brw);
1052
1053 return ret;
1054 }
1055
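/* Check whether the BOs referenced by this batch, plus extra_space, still fit
 * under the screen's aperture threshold.
 */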
1056 bool
1057 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
1058 {
1059 return brw->batch.aperture_space + extra_space <=
1060 brw->screen->aperture_threshold;
1061 }
1062
1063 bool
1064 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
1065 {
1066 unsigned index = READ_ONCE(bo->index);
1067 if (index < batch->exec_count && batch->exec_bos[index] == bo)
1068 return true;
1069
1070 for (int i = 0; i < batch->exec_count; i++) {
1071 if (batch->exec_bos[i] == bo)
1072 return true;
1073 }
1074 return false;
1075 }
1076
1077 /* This is the only way buffers get added to the validation list.
1078  */
1079 static uint64_t
1080 emit_reloc(struct intel_batchbuffer *batch,
1081 struct brw_reloc_list *rlist, uint32_t offset,
1082 struct brw_bo *target, int32_t target_offset,
1083 unsigned int reloc_flags)
1084 {
1085 assert(target != NULL);
1086
1087 if (rlist->reloc_count == rlist->reloc_array_size) {
1088 rlist->reloc_array_size *= 2;
1089 rlist->relocs = realloc(rlist->relocs,
1090 rlist->reloc_array_size *
1091 sizeof(struct drm_i915_gem_relocation_entry));
1092 }
1093
1094 unsigned int index = add_exec_bo(batch, target);
1095 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
1096
1097 if (reloc_flags & RELOC_32BIT) {
1098 /* Restrict this buffer to the low 32 bits of the address space.
1099 *
1100 * Altering the validation list flags restricts it for this batch,
1101 * but we also alter the BO's kflags to restrict it permanently
1102 * (until the BO is destroyed and put back in the cache). Buffers
1103       * may stay bound across batches, and we want to keep them constrained.
1104 */
1105 target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
1106 entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
1107
1108 /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
1109 reloc_flags &= ~RELOC_32BIT;
1110 }
1111
1112 if (reloc_flags)
1113 entry->flags |= reloc_flags & batch->valid_reloc_flags;
1114
1115 rlist->relocs[rlist->reloc_count++] =
1116 (struct drm_i915_gem_relocation_entry) {
1117 .offset = offset,
1118 .delta = target_offset,
1119 .target_handle = batch->use_batch_first ? index : target->gem_handle,
1120 .presumed_offset = entry->offset,
1121 };
1122
1123 /* Using the old buffer offset, write in what the right data would be, in
1124 * case the buffer doesn't move and we can short-circuit the relocation
1125    * processing in the kernel.
1126 */
1127 return entry->offset + target_offset;
1128 }
1129
1130 uint64_t
1131 brw_batch_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
1132 struct brw_bo *target, uint32_t target_offset,
1133 unsigned int reloc_flags)
1134 {
1135 assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));
1136
1137 return emit_reloc(batch, &batch->batch_relocs, batch_offset,
1138 target, target_offset, reloc_flags);
1139 }
1140
1141 uint64_t
1142 brw_state_reloc(struct intel_batchbuffer *batch, uint32_t state_offset,
1143 struct brw_bo *target, uint32_t target_offset,
1144 unsigned int reloc_flags)
1145 {
1146 assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
1147
1148 return emit_reloc(batch, &batch->state_relocs, state_offset,
1149 target, target_offset, reloc_flags);
1150 }
1151
1152
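/* Return the size recorded for the brw_state_batch() allocation at the given
 * statebuffer offset. The table is only populated when DEBUG_BATCH is set;
 * the batch decoder uses this to determine how many entries a variable-length
 * state block contains.
 */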
1153 uint32_t
1154 brw_state_batch_size(struct brw_context *brw, uint32_t offset)
1155 {
1156 struct hash_entry *entry =
1157 _mesa_hash_table_search(brw->batch.state_batch_sizes,
1158 (void *) (uintptr_t) offset);
1159 return entry ? (uintptr_t) entry->data : 0;
1160 }
1161
1162 /**
1163 * Reserve some space in the statebuffer, or flush.
1164 *
1165 * This is used to estimate when we're near the end of the batch,
1166 * so we can flush early.
1167 */
1168 void
1169 brw_require_statebuffer_space(struct brw_context *brw, int size)
1170 {
1171 if (brw->batch.state_used + size >= STATE_SZ)
1172 intel_batchbuffer_flush(brw);
1173 }
1174
1175 /**
1176 * Allocates a block of space in the batchbuffer for indirect state.
1177 */
1178 void *
1179 brw_state_batch(struct brw_context *brw,
1180 int size,
1181 int alignment,
1182 uint32_t *out_offset)
1183 {
1184 struct intel_batchbuffer *batch = &brw->batch;
1185
1186 assert(size < batch->state.bo->size);
1187
1188 uint32_t offset = ALIGN(batch->state_used, alignment);
1189
1190 if (offset + size >= STATE_SZ && !batch->no_wrap) {
1191 intel_batchbuffer_flush(brw);
1192 offset = ALIGN(batch->state_used, alignment);
1193 } else if (offset + size >= batch->state.bo->size) {
1194 const unsigned new_size =
1195 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
1196 MAX_STATE_SIZE);
1197 grow_buffer(brw, &batch->state, batch->state_used, new_size);
1198 assert(offset + size < batch->state.bo->size);
1199 }
1200
1201 if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
1202 _mesa_hash_table_insert(batch->state_batch_sizes,
1203 (void *) (uintptr_t) offset,
1204 (void *) (uintptr_t) size);
1205 }
1206
1207 batch->state_used = offset + size;
1208
1209 *out_offset = offset;
1210 return batch->state.map + (offset >> 2);
1211 }
1212
1213 void
1214 intel_batchbuffer_data(struct brw_context *brw,
1215 const void *data, GLuint bytes, enum brw_gpu_ring ring)
1216 {
1217 assert((bytes & 3) == 0);
1218 intel_batchbuffer_require_space(brw, bytes, ring);
1219 memcpy(brw->batch.map_next, data, bytes);
1220 brw->batch.map_next += bytes >> 2;
1221 }
1222
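/* Emit one MI_LOAD_REGISTER_MEM per DWord, loading `size` consecutive DWords
 * from `bo` at `offset` into the registers starting at `reg`.
 */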
1223 static void
1224 load_sized_register_mem(struct brw_context *brw,
1225 uint32_t reg,
1226 struct brw_bo *bo,
1227 uint32_t offset,
1228 int size)
1229 {
1230 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1231 int i;
1232
1233 /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
1234 assert(devinfo->gen >= 7);
1235
1236 if (devinfo->gen >= 8) {
1237 BEGIN_BATCH(4 * size);
1238 for (i = 0; i < size; i++) {
1239 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
1240 OUT_BATCH(reg + i * 4);
1241 OUT_RELOC64(bo, 0, offset + i * 4);
1242 }
1243 ADVANCE_BATCH();
1244 } else {
1245 BEGIN_BATCH(3 * size);
1246 for (i = 0; i < size; i++) {
1247 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
1248 OUT_BATCH(reg + i * 4);
1249 OUT_RELOC(bo, 0, offset + i * 4);
1250 }
1251 ADVANCE_BATCH();
1252 }
1253 }
1254
1255 void
1256 brw_load_register_mem(struct brw_context *brw,
1257 uint32_t reg,
1258 struct brw_bo *bo,
1259 uint32_t offset)
1260 {
1261 load_sized_register_mem(brw, reg, bo, offset, 1);
1262 }
1263
1264 void
1265 brw_load_register_mem64(struct brw_context *brw,
1266 uint32_t reg,
1267 struct brw_bo *bo,
1268 uint32_t offset)
1269 {
1270 load_sized_register_mem(brw, reg, bo, offset, 2);
1271 }
1272
1273 /*
1274 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
1275 */
1276 void
1277 brw_store_register_mem32(struct brw_context *brw,
1278 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1279 {
1280 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1281
1282 assert(devinfo->gen >= 6);
1283
1284 if (devinfo->gen >= 8) {
1285 BEGIN_BATCH(4);
1286 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1287 OUT_BATCH(reg);
1288 OUT_RELOC64(bo, RELOC_WRITE, offset);
1289 ADVANCE_BATCH();
1290 } else {
1291 BEGIN_BATCH(3);
1292 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1293 OUT_BATCH(reg);
1294 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1295 ADVANCE_BATCH();
1296 }
1297 }
1298
1299 /*
1300 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
1301 */
1302 void
1303 brw_store_register_mem64(struct brw_context *brw,
1304 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1305 {
1306 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1307
1308 assert(devinfo->gen >= 6);
1309
1310 /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
1311 * read a full 64-bit register, we need to do two of them.
1312 */
1313 if (devinfo->gen >= 8) {
1314 BEGIN_BATCH(8);
1315 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1316 OUT_BATCH(reg);
1317 OUT_RELOC64(bo, RELOC_WRITE, offset);
1318 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1319 OUT_BATCH(reg + sizeof(uint32_t));
1320 OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
1321 ADVANCE_BATCH();
1322 } else {
1323 BEGIN_BATCH(6);
1324 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1325 OUT_BATCH(reg);
1326 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1327 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1328 OUT_BATCH(reg + sizeof(uint32_t));
1329 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
1330 ADVANCE_BATCH();
1331 }
1332 }
1333
1334 /*
1335 * Write a 32-bit register using immediate data.
1336 */
1337 void
1338 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
1339 {
1340 assert(brw->screen->devinfo.gen >= 6);
1341
1342 BEGIN_BATCH(3);
1343 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
1344 OUT_BATCH(reg);
1345 OUT_BATCH(imm);
1346 ADVANCE_BATCH();
1347 }
1348
1349 /*
1350 * Write a 64-bit register using immediate data.
1351 */
1352 void
1353 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
1354 {
1355 assert(brw->screen->devinfo.gen >= 6);
1356
1357 BEGIN_BATCH(5);
1358 OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
1359 OUT_BATCH(reg);
1360 OUT_BATCH(imm & 0xffffffff);
1361 OUT_BATCH(reg + 4);
1362 OUT_BATCH(imm >> 32);
1363 ADVANCE_BATCH();
1364 }
1365
1366 /*
1367 * Copies a 32-bit register.
1368 */
1369 void
1370 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
1371 {
1372 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1373
1374 BEGIN_BATCH(3);
1375 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1376 OUT_BATCH(src);
1377 OUT_BATCH(dest);
1378 ADVANCE_BATCH();
1379 }
1380
1381 /*
1382 * Copies a 64-bit register.
1383 */
1384 void
1385 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
1386 {
1387 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1388
1389 BEGIN_BATCH(6);
1390 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1391 OUT_BATCH(src);
1392 OUT_BATCH(dest);
1393 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1394 OUT_BATCH(src + sizeof(uint32_t));
1395 OUT_BATCH(dest + sizeof(uint32_t));
1396 ADVANCE_BATCH();
1397 }
1398
1399 /*
1400 * Write 32-bits of immediate data to a GPU memory buffer.
1401 */
1402 void
1403 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1404 uint32_t offset, uint32_t imm)
1405 {
1406 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1407
1408 assert(devinfo->gen >= 6);
1409
1410 BEGIN_BATCH(4);
1411 OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1412 if (devinfo->gen >= 8)
1413 OUT_RELOC64(bo, RELOC_WRITE, offset);
1414 else {
1415 OUT_BATCH(0); /* MBZ */
1416 OUT_RELOC(bo, RELOC_WRITE, offset);
1417 }
1418 OUT_BATCH(imm);
1419 ADVANCE_BATCH();
1420 }
1421
1422 /*
1423 * Write 64-bits of immediate data to a GPU memory buffer.
1424 */
1425 void
1426 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1427 uint32_t offset, uint64_t imm)
1428 {
1429 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1430
1431 assert(devinfo->gen >= 6);
1432
1433 BEGIN_BATCH(5);
1434 OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1435 if (devinfo->gen >= 8)
1436 OUT_RELOC64(bo, RELOC_WRITE, offset);
1437 else {
1438 OUT_BATCH(0); /* MBZ */
1439 OUT_RELOC(bo, RELOC_WRITE, offset);
1440 }
1441 OUT_BATCH(imm & 0xffffffffu);
1442 OUT_BATCH(imm >> 32);
1443 ADVANCE_BATCH();
1444 }