/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "brw_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "common/gen_decoder.h"

#include "util/hash_table.h"

#include <xf86drm.h>
#include <i915_drm.h>
#define FILE_DEBUG_FLAG DEBUG_BUFMGR
/**
 * Target sizes of the batch and state buffers.  We create the initial
 * buffers at these sizes, and flush when they're nearly full.  If we
 * underestimate how close we are to the end, and suddenly need more space
 * in the middle of a draw, we can grow the buffers, and finish the draw.
 * At that point, we'll be over our target size, so the next operation
 * should flush.  Each time we flush the batch, we recreate both buffers
 * at the original target size, so it doesn't grow without bound.
 */
#define BATCH_SZ (20 * 1024)
#define STATE_SZ (16 * 1024)
/* The kernel assumes batchbuffers are smaller than 256kB. */
#define MAX_BATCH_SIZE (256 * 1024)

/* 3DSTATE_BINDING_TABLE_POINTERS has a U16 offset from Surface State Base
 * Address, which means that we can't put binding tables beyond 64kB.  This
 * effectively limits the maximum statebuffer size to 64kB.
 */
#define MAX_STATE_SIZE (64 * 1024)
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
                        struct intel_screen *screen);
static bool
uint_key_compare(const void *a, const void *b)
{
   return a == b;
}

static uint32_t
uint_key_hash(const void *key)
{
   return (uintptr_t) key;
}
static void
init_reloc_list(struct brw_reloc_list *rlist, int count)
{
   rlist->reloc_count = 0;
   rlist->reloc_array_size = count;
   rlist->relocs = malloc(rlist->reloc_array_size *
                          sizeof(struct drm_i915_gem_relocation_entry));
}
void
intel_batchbuffer_init(struct intel_screen *screen,
                       struct intel_batchbuffer *batch)
{
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!devinfo->has_llc) {
      batch->batch_cpu_map = malloc(BATCH_SZ);
      batch->map = batch->batch_cpu_map;
      batch->map_next = batch->map;
      batch->state_cpu_map = malloc(STATE_SZ);
      batch->state_map = batch->state_cpu_map;
   }

   init_reloc_list(&batch->batch_relocs, 250);
   init_reloc_list(&batch->state_relocs, 250);

   batch->exec_count = 0;
   batch->exec_array_size = 100;
   batch->exec_bos =
      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
   batch->validation_list =
      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));

   if (INTEL_DEBUG & DEBUG_BATCH) {
      batch->state_batch_sizes =
         _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
   }

   batch->use_batch_first =
      screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;

   /* PIPE_CONTROL needs a w/a but only on gen6 */
   batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
   if (devinfo->gen == 6)
      batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;

   intel_batchbuffer_reset(batch, screen);
}
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
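
/* Add a BO to the batch's exec list and validation list, if it isn't
 * already present, and return its index in those lists.  Takes a
 * reference on the BO.
 */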
static unsigned
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
   unsigned index = READ_ONCE(bo->index);

   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return index;

   /* May have been shared between multiple active batches */
   for (index = 0; index < batch->exec_count; index++) {
      if (batch->exec_bos[index] == bo)
         return index;
   }

   brw_bo_reference(bo);

   if (batch->exec_count == batch->exec_array_size) {
      batch->exec_array_size *= 2;
      batch->exec_bos =
         realloc(batch->exec_bos,
                 batch->exec_array_size * sizeof(batch->exec_bos[0]));
      batch->validation_list =
         realloc(batch->validation_list,
                 batch->exec_array_size * sizeof(batch->validation_list[0]));
   }

   batch->validation_list[batch->exec_count] =
      (struct drm_i915_gem_exec_object2) {
         .handle = bo->gem_handle,
         .alignment = bo->align,
         .offset = bo->gtt_offset,
         .flags = bo->kflags,
      };

   bo->index = batch->exec_count;
   batch->exec_bos[batch->exec_count] = bo;
   batch->aperture_space += bo->size;

   return batch->exec_count++;
}
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
                        struct intel_screen *screen)
{
   struct brw_bufmgr *bufmgr = screen->bufmgr;

   if (batch->last_bo != NULL) {
      brw_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->bo;

   batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
   if (!batch->batch_cpu_map) {
      batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
   }
   batch->map_next = batch->map;

   batch->state_bo = brw_bo_alloc(bufmgr, "statebuffer", STATE_SZ, 4096);
   batch->state_bo->kflags =
      can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
   if (!batch->state_cpu_map) {
      batch->state_map =
         brw_bo_map(NULL, batch->state_bo, MAP_READ | MAP_WRITE);
   }

   /* Avoid making 0 a valid state offset - otherwise the decoder will try
    * and decode data when we use offset 0 as a null pointer.
    */
   batch->state_used = 1;

   add_exec_bo(batch, batch->bo);
   assert(batch->bo->index == 0);

   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   batch->ring = UNKNOWN_RING;

   if (batch->state_batch_sizes)
      _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
}
static void
intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
{
   intel_batchbuffer_reset(&brw->batch, brw->screen);
   brw_render_cache_set_clear(brw);
}
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
   brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
   brw->batch.saved.exec_count = brw->batch.exec_count;
}
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   for (int i = brw->batch.saved.exec_count;
        i < brw->batch.exec_count; i++) {
      brw_bo_unreference(brw->batch.exec_bos[i]);
   }
   brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
   brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
   brw->batch.exec_count = brw->batch.saved.exec_count;

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw->batch.ring = UNKNOWN_RING;
}
void
intel_batchbuffer_free(struct intel_batchbuffer *batch)
{
   free(batch->batch_cpu_map);
   free(batch->state_cpu_map);

   for (int i = 0; i < batch->exec_count; i++) {
      brw_bo_unreference(batch->exec_bos[i]);
   }
   free(batch->batch_relocs.relocs);
   free(batch->state_relocs.relocs);
   free(batch->exec_bos);
   free(batch->validation_list);

   brw_bo_unreference(batch->last_bo);
   brw_bo_unreference(batch->bo);
   brw_bo_unreference(batch->state_bo);
   if (batch->state_batch_sizes)
      _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
}
static void
replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
                         uint32_t old_handle, uint32_t new_handle)
{
   for (int i = 0; i < rlist->reloc_count; i++) {
      if (rlist->relocs[i].target_handle == old_handle)
         rlist->relocs[i].target_handle = new_handle;
   }
}
/**
 * Grow either the batch or state buffer to a new larger size.
 *
 * We can't actually grow buffers, so we allocate a new one, copy over
 * the existing contents, and update our lists to refer to the new one.
 *
 * Note that this is only temporary - each new batch recreates the buffers
 * at their original target size (BATCH_SZ or STATE_SZ).
 */
static void
grow_buffer(struct brw_context *brw,
            struct brw_bo **bo_ptr,
            uint32_t **map_ptr,
            uint32_t **cpu_map_ptr,
            unsigned existing_bytes,
            unsigned new_size)
{
   struct intel_batchbuffer *batch = &brw->batch;
   struct brw_bufmgr *bufmgr = brw->bufmgr;

   uint32_t *old_map = *map_ptr;
   struct brw_bo *old_bo = *bo_ptr;

   struct brw_bo *new_bo = brw_bo_alloc(bufmgr, old_bo->name, new_size, 4096);
   uint32_t *new_map;

   perf_debug("Growing %s - ran out of space\n", old_bo->name);

   /* Copy existing data to the new larger buffer */
   if (*cpu_map_ptr) {
      *cpu_map_ptr = new_map = realloc(*cpu_map_ptr, new_size);
   } else {
      new_map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
      memcpy(new_map, old_map, existing_bytes);
   }

   /* Try to put the new BO at the same GTT offset as the old BO (which
    * we're throwing away, so it doesn't need to be there).
    *
    * This guarantees that our relocations continue to work: values we've
    * already written into the buffer, values we're going to write into the
    * buffer, and the validation/relocation lists all will match.
    */
   new_bo->gtt_offset = old_bo->gtt_offset;
   new_bo->index = old_bo->index;

   /* Batch/state buffers are per-context, and if we've run out of space,
    * we must have actually used them before, so...they will be in the list.
    */
   assert(old_bo->index < batch->exec_count);
   assert(batch->exec_bos[old_bo->index] == old_bo);

   /* Update the validation list to use the new BO. */
   batch->exec_bos[old_bo->index] = new_bo;
   batch->validation_list[old_bo->index].handle = new_bo->gem_handle;
   brw_bo_reference(new_bo);
   brw_bo_unreference(old_bo);

   if (!batch->use_batch_first) {
      /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
       * update the relocation list entries to point at the new BO as well.
       * (With newer kernels, the "handle" is an offset into the validation
       * list, which remains unchanged, so we can skip this.)
       */
      replace_bo_in_reloc_list(&batch->batch_relocs,
                               old_bo->gem_handle, new_bo->gem_handle);
      replace_bo_in_reloc_list(&batch->state_relocs,
                               old_bo->gem_handle, new_bo->gem_handle);
   }

   /* Drop the *bo_ptr reference.  This should free the old BO. */
   brw_bo_unreference(old_bo);

   *bo_ptr = new_bo;
   *map_ptr = new_map;
}
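
/**
 * Ensure the batchbuffer has room for sz bytes of commands, flushing (or,
 * if we're mid-draw and can't flush, growing the buffer) as needed, and
 * record which ring the upcoming commands will target.
 */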
void
intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                enum brw_gpu_ring ring)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct intel_batchbuffer *batch = &brw->batch;

   /* If we're switching rings, implicitly flush the batch. */
   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
       devinfo->gen >= 6) {
      intel_batchbuffer_flush(brw);
   }

   const unsigned batch_used = USED_BATCH(*batch) * 4;
   if (batch_used + sz >= BATCH_SZ) {
      if (!brw->no_batch_wrap) {
         intel_batchbuffer_flush(brw);
      } else {
         const unsigned new_size =
            MIN2(batch->bo->size + batch->bo->size / 2, MAX_BATCH_SIZE);
         grow_buffer(brw, &batch->bo, &batch->map, &batch->batch_cpu_map,
                     batch_used, new_size);
         batch->map_next = (void *) batch->map + batch_used;
         assert(batch_used + sz < batch->bo->size);
      }
   }

   /* The intel_batchbuffer_flush() calls above might have changed
    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    */
   brw->batch.ring = ring;
}
#ifdef DEBUG
#define CSI "\e["
#define BLUE_HEADER  CSI "0;44m"
#define NORMAL       CSI "0m"
static void
decode_struct(struct brw_context *brw, struct gen_spec *spec,
              const char *struct_name, uint32_t *data,
              uint32_t gtt_offset, uint32_t offset, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   fprintf(stderr, "%s\n", struct_name);
   gen_print_group(stderr, group, gtt_offset + offset,
                   &data[offset / 4], color);
}
static void
decode_structs(struct brw_context *brw, struct gen_spec *spec,
               const char *struct_name,
               uint32_t *data, uint32_t gtt_offset, uint32_t offset,
               int struct_size, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   int entries = brw_state_batch_size(brw, offset) / struct_size;
   for (int i = 0; i < entries; i++) {
      fprintf(stderr, "%s %d\n", struct_name, i);
      gen_print_group(stderr, group, gtt_offset + offset,
                      &data[(offset + i * struct_size) / 4], color);
   }
}
static void
do_batch_dump(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct intel_batchbuffer *batch = &brw->batch;
   struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);

   if (batch->ring != RENDER_RING)
      return;

   uint32_t *batch_data = brw_bo_map(brw, batch->bo, MAP_READ);
   uint32_t *state = brw_bo_map(brw, batch->state_bo, MAP_READ);
   if (batch_data == NULL || state == NULL) {
      fprintf(stderr, "WARNING: failed to map batchbuffer/statebuffer\n");
      return;
   }

   uint32_t *end = batch_data + USED_BATCH(*batch);
   uint32_t batch_gtt_offset = batch->bo->gtt_offset;
   uint32_t state_gtt_offset = batch->state_bo->gtt_offset;
   int length;

   bool color = INTEL_DEBUG & DEBUG_COLOR;
   const char *header_color = color ? BLUE_HEADER : "";
   const char *reset_color  = color ? NORMAL : "";

   for (uint32_t *p = batch_data; p < end; p += length) {
      struct gen_group *inst = gen_spec_find_instruction(spec, p);
      length = gen_group_get_length(inst, p);
      assert(inst == NULL || length > 0);
      length = MAX2(1, length);
      if (inst == NULL) {
         fprintf(stderr, "unknown instruction %08x\n", p[0]);
         continue;
      }

      uint64_t offset = batch_gtt_offset + 4 * (p - batch_data);

      fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
              offset, p[0], gen_group_get_name(inst), reset_color);

      gen_print_group(stderr, inst, offset, p, color);

      switch (gen_group_get_opcode(inst) >> 16) {
      case _3DSTATE_PIPELINED_POINTERS:
         /* Note: these Gen4-5 pointers are full relocations rather than
          * offsets from the start of the statebuffer.  So we need to subtract
          * gtt_offset (the start of the statebuffer) to obtain an offset we
          * can add to the map and get at the data.
          */
         decode_struct(brw, spec, "VS_STATE", state, state_gtt_offset,
                       (p[1] & ~0x1fu) - state_gtt_offset, color);
         decode_struct(brw, spec, "GS_STATE", state, state_gtt_offset,
                       (p[2] & ~0x1fu) - state_gtt_offset, color);
         decode_struct(brw, spec, "CLIP_STATE", state, state_gtt_offset,
                       (p[3] & ~0x1fu) - state_gtt_offset, color);
         decode_struct(brw, spec, "SF_STATE", state, state_gtt_offset,
                       (p[4] & ~0x1fu) - state_gtt_offset, color);
         decode_struct(brw, spec, "WM_STATE", state, state_gtt_offset,
                       (p[5] & ~0x1fu) - state_gtt_offset, color);
         decode_struct(brw, spec, "COLOR_CALC_STATE", state, state_gtt_offset,
                       (p[6] & ~0x3fu) - state_gtt_offset, color);
         break;
      case _3DSTATE_BINDING_TABLE_POINTERS_VS:
      case _3DSTATE_BINDING_TABLE_POINTERS_HS:
      case _3DSTATE_BINDING_TABLE_POINTERS_DS:
      case _3DSTATE_BINDING_TABLE_POINTERS_GS:
      case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
         struct gen_group *group =
            gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
         if (!group)
            break;

         uint32_t bt_offset = p[1] & ~0x1fu;
         int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
         uint32_t *bt_pointers = &state[bt_offset / 4];
         for (int i = 0; i < bt_entries; i++) {
            fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
            gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
                            &state[bt_pointers[i] / 4], color);
         }
         break;
      }
      case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
         decode_structs(brw, spec, "SAMPLER_STATE", state,
                        state_gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS:
         decode_structs(brw, spec, "CLIP_VIEWPORT", state,
                        state_gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
         decode_structs(brw, spec, "SF_VIEWPORT", state,
                        state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         decode_structs(brw, spec, "CC_VIEWPORT", state,
                        state_gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
         decode_structs(brw, spec, "CC_VIEWPORT", state,
                        state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
         decode_structs(brw, spec, "SF_CLIP_VIEWPORT", state,
                        state_gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
         break;
      case _3DSTATE_SCISSOR_STATE_POINTERS:
         decode_structs(brw, spec, "SCISSOR_RECT", state,
                        state_gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
         break;
      case _3DSTATE_BLEND_STATE_POINTERS:
         /* TODO: handle Gen8+ extra dword at the beginning */
         decode_structs(brw, spec, "BLEND_STATE", state,
                        state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         break;
      case _3DSTATE_CC_STATE_POINTERS:
         if (devinfo->gen >= 7) {
            decode_struct(brw, spec, "COLOR_CALC_STATE", state,
                          state_gtt_offset, p[1] & ~0x3fu, color);
         } else if (devinfo->gen == 6) {
            decode_structs(brw, spec, "BLEND_STATE", state,
                           state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
            decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
                          state_gtt_offset, p[2] & ~0x3fu, color);
            decode_struct(brw, spec, "COLOR_CALC_STATE", state,
                          state_gtt_offset, p[3] & ~0x3fu, color);
         }
         break;
      case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
         decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
                       state_gtt_offset, p[1] & ~0x3fu, color);
         break;
      }
   }

   brw_bo_unmap(batch->bo);
   brw_bo_unmap(batch->state_bo);
}
#else
static void do_batch_dump(struct brw_context *brw) { }
#endif
/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Unreference any BOs held by the previous batch, and reset counts. */
   for (int i = 0; i < brw->batch.exec_count; i++) {
      brw_bo_unreference(brw->batch.exec_bos[i]);
      brw->batch.exec_bos[i] = NULL;
   }
   brw->batch.batch_relocs.reloc_count = 0;
   brw->batch.state_relocs.reloc_count = 0;
   brw->batch.exec_count = 0;
   brw->batch.aperture_space = 0;

   brw_bo_unreference(brw->batch.state_bo);

   /* Create a new batchbuffer and reset the associated state: */
   intel_batchbuffer_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == 0)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->ib.index_size = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);
}
/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaks into contexts set MI_RESTORE_INHIBIT which
       * assume that the L3 cache is configured according to the hardware
       * defaults.
       */
      if (devinfo->gen >= 7)
         gen7_restore_default_l3_config(brw);

      if (devinfo->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
          *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe control
          * flush here as well. We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }
}
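
/* Rate-limit our rendering relative to the display/GPU, using the two
 * schemes described in the comments below.
 */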
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU speed limited.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point for
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling) {
            /* Pass NULL rather than brw so we avoid perf_debug warnings;
             * stalling is common and expected here...
             */
            brw_bo_wait_rendering(brw->throttle_batch[1]);
         }
         brw_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *dri_screen = brw->screen->driScrnPriv;
      drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}
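
/* Thin wrapper around DRM_IOCTL_I915_GEM_EXECBUFFER2(_WR): submits the
 * validation list, handles the optional in/out fences, and writes back
 * any GTT offsets the kernel chose to move.
 */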
static int
execbuffer(int fd,
           struct intel_batchbuffer *batch,
           uint32_t ctx_id,
           int used,
           int in_fence,
           int *out_fence,
           int flags)
{
   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t) batch->validation_list,
      .buffer_count = batch->exec_count,
      .batch_start_offset = 0,
      .batch_len = used,
      .flags = flags,
      .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
   };

   unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;

   if (in_fence != -1) {
      execbuf.rsvd2 = in_fence;
      execbuf.flags |= I915_EXEC_FENCE_IN;
   }

   if (out_fence != NULL) {
      cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
      *out_fence = -1;
      execbuf.flags |= I915_EXEC_FENCE_OUT;
   }

   int ret = drmIoctl(fd, cmd, &execbuf);
   if (ret != 0)
      ret = -errno;

   for (int i = 0; i < batch->exec_count; i++) {
      struct brw_bo *bo = batch->exec_bos[i];

      bo->idle = false;
      bo->index = -1;

      /* Update brw_bo::gtt_offset */
      if (batch->validation_list[i].offset != bo->gtt_offset) {
         DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
             bo->gem_handle, bo->gtt_offset,
             batch->validation_list[i].offset);
         bo->gtt_offset = batch->validation_list[i].offset;
      }
   }

   if (ret == 0 && out_fence != NULL)
      *out_fence = execbuf.rsvd2 >> 32;

   return ret;
}
static int
do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   __DRIscreen *dri_screen = brw->screen->driScrnPriv;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (batch->batch_cpu_map) {
      void *bo_map = brw_bo_map(brw, batch->bo, MAP_WRITE);
      memcpy(bo_map, batch->batch_cpu_map, 4 * USED_BATCH(*batch));
   }

   if (batch->state_cpu_map) {
      void *bo_map = brw_bo_map(brw, batch->state_bo, MAP_WRITE);
      memcpy(bo_map, batch->state_cpu_map, batch->state_used);
   }

   brw_bo_unmap(batch->bo);
   brw_bo_unmap(batch->state_bo);

   if (!brw->screen->no_hw) {
      /* The requirement for using I915_EXEC_NO_RELOC are:
       *
       *   The addresses written in the objects must match the corresponding
       *   reloc.presumed_offset which in turn must match the corresponding
       *   execobject.offset.
       *
       *   Any render targets written to in the batch must be flagged with
       *   EXEC_OBJECT_WRITE.
       *
       *   To avoid stalling, execobject.offset should match the current
       *   address of that object within the active context.
       */
      int flags = I915_EXEC_NO_RELOC;

      if (devinfo->gen >= 6 && batch->ring == BLT_RING) {
         flags |= I915_EXEC_BLT;
      } else {
         flags |= I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;

      /* Set statebuffer relocations */
      const unsigned state_index = batch->state_bo->index;
      if (state_index < batch->exec_count &&
          batch->exec_bos[state_index] == batch->state_bo) {
         struct drm_i915_gem_exec_object2 *entry =
            &batch->validation_list[state_index];
         assert(entry->handle == batch->state_bo->gem_handle);
         entry->relocation_count = batch->state_relocs.reloc_count;
         entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
      }

      /* Set batchbuffer relocations */
      struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
      assert(entry->handle == batch->bo->gem_handle);
      entry->relocation_count = batch->batch_relocs.reloc_count;
      entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;

      if (batch->use_batch_first) {
         flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
      } else {
         /* Move the batch to the end of the validation list */
         struct drm_i915_gem_exec_object2 tmp;
         const unsigned index = batch->exec_count - 1;

         tmp = *entry;
         *entry = batch->validation_list[index];
         batch->validation_list[index] = tmp;
      }

      ret = execbuffer(dri_screen->fd, batch, hw_ctx,
                       4 * USED_BATCH(*batch),
                       in_fence_fd, out_fence_fd, flags);

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}
/**
 * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
 * of the fd.
 *
 * The out_fence_fd is ignored if NULL.  Otherwise, the caller takes ownership
 * of the returned fd.
 */
int
_intel_batchbuffer_flush_fence(struct brw_context *brw,
                               int in_fence_fd, int *out_fence_fd,
                               const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      brw_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.state_used;
      fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
              " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
              " %4d batch relocs, %4d state relocs\n", file, line,
              bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
              bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
              brw->batch.exec_count,
              (float) brw->batch.aperture_space / (1024 * 1024),
              brw->batch.batch_relocs.reloc_count,
              brw->batch.state_relocs.reloc_count);
   }

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      brw_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}
bool
brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
{
   return brw->batch.aperture_space + extra_space <=
          brw->screen->aperture_threshold;
}
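
/* Return true if the given BO is referenced by the current batch. */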
bool
brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
   unsigned index = READ_ONCE(bo->index);
   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return true;

   for (int i = 0; i < batch->exec_count; i++) {
      if (batch->exec_bos[i] == bo)
         return true;
   }
   return false;
}
/*  This is the only way buffers get added to the validate list.
 */
static uint64_t
emit_reloc(struct intel_batchbuffer *batch,
           struct brw_reloc_list *rlist, uint32_t offset,
           struct brw_bo *target, uint32_t target_offset,
           unsigned int reloc_flags)
{
   assert(target != NULL);

   if (rlist->reloc_count == rlist->reloc_array_size) {
      rlist->reloc_array_size *= 2;
      rlist->relocs = realloc(rlist->relocs,
                              rlist->reloc_array_size *
                              sizeof(struct drm_i915_gem_relocation_entry));
   }

   unsigned int index = add_exec_bo(batch, target);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];

   if (reloc_flags)
      entry->flags |= reloc_flags & batch->valid_reloc_flags;

   rlist->relocs[rlist->reloc_count++] =
      (struct drm_i915_gem_relocation_entry) {
         .offset = offset,
         .delta = target_offset,
         .target_handle = batch->use_batch_first ? index : target->gem_handle,
         .presumed_offset = entry->offset,
      };

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return entry->offset + target_offset;
}
uint64_t
brw_batch_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
                struct brw_bo *target, uint32_t target_offset,
                unsigned int reloc_flags)
{
   assert(batch_offset <= batch->bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->batch_relocs, batch_offset,
                     target, target_offset, reloc_flags);
}
uint64_t
brw_state_reloc(struct intel_batchbuffer *batch, uint32_t state_offset,
                struct brw_bo *target, uint32_t target_offset,
                unsigned int reloc_flags)
{
   assert(state_offset <= batch->state_bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->state_relocs, state_offset,
                     target, target_offset, reloc_flags);
}
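
/* Return the size recorded for the state allocated at \p offset, or 0.
 * Only populated when batch debugging (DEBUG_BATCH) is enabled.
 */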
uint32_t
brw_state_batch_size(struct brw_context *brw, uint32_t offset)
{
   struct hash_entry *entry =
      _mesa_hash_table_search(brw->batch.state_batch_sizes,
                              (void *) (uintptr_t) offset);
   return entry ? (uintptr_t) entry->data : 0;
}
/**
 * Reserve some space in the statebuffer, or flush.
 *
 * This is used to estimate when we're near the end of the batch,
 * so we can flush early.
 */
void
brw_require_statebuffer_space(struct brw_context *brw, int size)
{
   if (brw->batch.state_used + size >= STATE_SZ)
      intel_batchbuffer_flush(brw);
}
/**
 * Allocates a block of space in the batchbuffer for indirect state.
 */
void *
brw_state_batch(struct brw_context *brw,
                int size,
                int alignment,
                uint32_t *out_offset)
{
   struct intel_batchbuffer *batch = &brw->batch;

   assert(size < batch->bo->size);

   uint32_t offset = ALIGN(batch->state_used, alignment);

   if (offset + size >= STATE_SZ) {
      if (!brw->no_batch_wrap) {
         intel_batchbuffer_flush(brw);
         offset = ALIGN(batch->state_used, alignment);
      } else {
         const unsigned new_size =
            MIN2(batch->state_bo->size + batch->state_bo->size / 2,
                 MAX_STATE_SIZE);
         grow_buffer(brw, &batch->state_bo, &batch->state_map,
                     &batch->state_cpu_map, batch->state_used, new_size);
         assert(offset + size < batch->state_bo->size);
      }
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      _mesa_hash_table_insert(batch->state_batch_sizes,
                              (void *) (uintptr_t) offset,
                              (void *) (uintptr_t) size);
   }

   batch->state_used = offset + size;

   *out_offset = offset;
   return batch->state_map + (offset >> 2);
}
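
/* Illustrative (hypothetical sizes/alignment) use of brw_state_batch():
 *
 *    uint32_t offset;
 *    uint32_t *cc = brw_state_batch(brw, 2 * sizeof(uint32_t), 64, &offset);
 *    cc[0] = first_dword;
 *    cc[1] = second_dword;
 *
 * The returned pointer is for CPU writes; "offset" is what gets emitted
 * into a *_STATE_POINTERS packet.
 */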
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}
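
/* Emit MI_LOAD_REGISTER_MEM to load "size" consecutive 32-bit registers,
 * starting at "reg", from the given buffer offset.
 */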
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset,
                        int size)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(devinfo->gen >= 7);

   if (devinfo->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      struct brw_bo *bo,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 2);
}
/**
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   if (devinfo->gen >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      ADVANCE_BATCH();
   }
}
/**
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (devinfo->gen >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}
/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}
1213 brw_load_register_imm64(struct brw_context
*brw
, uint32_t reg
, uint64_t imm
)
1215 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
1217 assert(devinfo
->gen
>= 6);
1220 OUT_BATCH(MI_LOAD_REGISTER_IMM
| (5 - 2));
1222 OUT_BATCH(imm
& 0xffffffff);
1224 OUT_BATCH(imm
>> 32);
/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 8 || devinfo->is_haswell);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}
/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 8 || devinfo->is_haswell);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}
/*
 * Write 32-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (devinfo->gen >= 8)
      OUT_RELOC64(bo, RELOC_WRITE, offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, RELOC_WRITE, offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}
/*
 * Write 64-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (devinfo->gen >= 8)
      OUT_RELOC64(bo, RELOC_WRITE, offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, RELOC_WRITE, offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}