1 /*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
35
36 #include "util/hash_table.h"
37
38 #include <xf86drm.h>
39 #include <i915_drm.h>
40
41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
42
43 static void
44 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
45 struct brw_bufmgr *bufmgr,
46 bool has_llc);
47
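/* Hash-table callbacks for the state_batch_sizes debug table below: keys
 * are small integer offsets stored directly in the pointer value, so the
 * hash is the value itself and comparison is plain pointer equality.
 */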
48 static bool
49 uint_key_compare(const void *a, const void *b)
50 {
51 return a == b;
52 }
53
54 static uint32_t
55 uint_key_hash(const void *key)
56 {
57 return (uintptr_t) key;
58 }
59
60 void
61 intel_batchbuffer_init(struct intel_batchbuffer *batch,
62 struct brw_bufmgr *bufmgr,
63 bool has_llc)
64 {
65 struct brw_context *brw = container_of(batch, brw, batch);
66
67 if (!has_llc) {
68 batch->cpu_map = malloc(BATCH_SZ);
69 batch->map = batch->cpu_map;
70 batch->map_next = batch->cpu_map;
71 }
72
73 batch->reloc_count = 0;
74 batch->reloc_array_size = 250;
75 batch->relocs = malloc(batch->reloc_array_size *
76 sizeof(struct drm_i915_gem_relocation_entry));
77 batch->exec_count = 0;
78 batch->exec_array_size = 100;
79 batch->exec_bos =
80 malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
81 batch->validation_list =
82 malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
83
84 if (INTEL_DEBUG & DEBUG_BATCH) {
85 batch->state_batch_sizes =
86 _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
87 }
88
89 batch->use_batch_first =
90 brw->screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
91
92 intel_batchbuffer_reset(batch, bufmgr, has_llc);
93 }
94
95 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
96
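/* Add a BO to the batch's execbuf validation list (unless it is already
 * there) and return its index in that list.  bo->index is only a hint,
 * since a buffer may be shared between several in-flight batches, so it
 * is verified against this batch's list before being trusted.
 */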
97 static unsigned
98 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
99 {
100 unsigned index = READ_ONCE(bo->index);
101
102 if (index < batch->exec_count && batch->exec_bos[index] == bo)
103 return index;
104
105 /* May have been shared between multiple active batches */
106 for (index = 0; index < batch->exec_count; index++) {
107 if (batch->exec_bos[index] == bo)
108 return index;
109 }
110
111 if (bo != batch->bo)
112 brw_bo_reference(bo);
113
114 if (batch->exec_count == batch->exec_array_size) {
115 batch->exec_array_size *= 2;
116 batch->exec_bos =
117 realloc(batch->exec_bos,
118 batch->exec_array_size * sizeof(batch->exec_bos[0]));
119 batch->validation_list =
120 realloc(batch->validation_list,
121 batch->exec_array_size * sizeof(batch->validation_list[0]));
122 }
123
124 batch->validation_list[batch->exec_count] =
125 (struct drm_i915_gem_exec_object2) {
126 .handle = bo->gem_handle,
127 .alignment = bo->align,
128 .offset = bo->offset64,
129 .flags = bo->kflags,
130 };
131
132 bo->index = batch->exec_count;
133 batch->exec_bos[batch->exec_count] = bo;
134 batch->aperture_space += bo->size;
135
136 return batch->exec_count++;
137 }
138
139 static void
140 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
141 struct brw_bufmgr *bufmgr,
142 bool has_llc)
143 {
144 if (batch->last_bo != NULL) {
145 brw_bo_unreference(batch->last_bo);
146 batch->last_bo = NULL;
147 }
148 batch->last_bo = batch->bo;
149
150 batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
151 if (has_llc) {
152 batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
153 }
154 batch->map_next = batch->map;
155
156 add_exec_bo(batch, batch->bo);
157 assert(batch->bo->index == 0);
158
159 batch->reserved_space = BATCH_RESERVED;
160 batch->state_batch_offset = batch->bo->size;
161 batch->needs_sol_reset = false;
162 batch->state_base_address_emitted = false;
163
164 /* We don't know what ring the new batch will be sent to until we see the
165 * first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
166 */
167 batch->ring = UNKNOWN_RING;
168
169 if (batch->state_batch_sizes)
170 _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
171 }
172
173 static void
174 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
175 {
176 intel_batchbuffer_reset(&brw->batch, brw->bufmgr, brw->has_llc);
177 brw_render_cache_set_clear(brw);
178 }
179
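/* Save/restore points: a caller can snapshot the batch (write pointer,
 * relocation count, validation-list size) and later discard everything
 * emitted since the snapshot, e.g. when it decides to flush and retry
 * in a fresh batch instead.
 */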
180 void
181 intel_batchbuffer_save_state(struct brw_context *brw)
182 {
183 brw->batch.saved.map_next = brw->batch.map_next;
184 brw->batch.saved.reloc_count = brw->batch.reloc_count;
185 brw->batch.saved.exec_count = brw->batch.exec_count;
186 }
187
188 void
189 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
190 {
191 for (int i = brw->batch.saved.exec_count;
192 i < brw->batch.exec_count; i++) {
193 if (brw->batch.exec_bos[i] != brw->batch.bo) {
194 brw_bo_unreference(brw->batch.exec_bos[i]);
195 }
196 }
197 brw->batch.reloc_count = brw->batch.saved.reloc_count;
198 brw->batch.exec_count = brw->batch.saved.exec_count;
199
200 brw->batch.map_next = brw->batch.saved.map_next;
201 if (USED_BATCH(brw->batch) == 0)
202 brw->batch.ring = UNKNOWN_RING;
203 }
204
205 void
206 intel_batchbuffer_free(struct intel_batchbuffer *batch)
207 {
208 free(batch->cpu_map);
209
210 for (int i = 0; i < batch->exec_count; i++) {
211 if (batch->exec_bos[i] != batch->bo) {
212 brw_bo_unreference(batch->exec_bos[i]);
213 }
214 }
215 free(batch->relocs);
216 free(batch->exec_bos);
217 free(batch->validation_list);
218
219 brw_bo_unreference(batch->last_bo);
220 brw_bo_unreference(batch->bo);
221 if (batch->state_batch_sizes)
222 _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
223 }
224
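/* Guarantee that at least @sz bytes are available in the batch for
 * commands targeting @ring, flushing first if the batch is too full or
 * currently targets a different ring.
 */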
225 void
226 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
227 enum brw_gpu_ring ring)
228 {
229 /* If we're switching rings, implicitly flush the batch. */
230 if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
231 brw->gen >= 6) {
232 intel_batchbuffer_flush(brw);
233 }
234
235 #ifdef DEBUG
236 assert(sz < BATCH_SZ - BATCH_RESERVED);
237 #endif
238 if (intel_batchbuffer_space(&brw->batch) < sz)
239 intel_batchbuffer_flush(brw);
240
241 /* The intel_batchbuffer_flush() calls above might have changed
242 * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
243 */
244 brw->batch.ring = ring;
245 }
246
247 #ifdef DEBUG
248 #define CSI "\e["
249 #define BLUE_HEADER CSI "0;44m"
250 #define NORMAL CSI "0m"
251
252
253 static void
254 decode_struct(struct brw_context *brw, struct gen_spec *spec,
255 const char *struct_name, uint32_t *data,
256 uint32_t gtt_offset, uint32_t offset, bool color)
257 {
258 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
259 if (!group)
260 return;
261
262 fprintf(stderr, "%s\n", struct_name);
263 gen_print_group(stderr, group, gtt_offset + offset,
264 &data[offset / 4], color);
265 }
266
267 static void
268 decode_structs(struct brw_context *brw, struct gen_spec *spec,
269 const char *struct_name,
270 uint32_t *data, uint32_t gtt_offset, uint32_t offset,
271 int struct_size, bool color)
272 {
273 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
274 if (!group)
275 return;
276
277 int entries = brw_state_batch_size(brw, offset) / struct_size;
278 for (int i = 0; i < entries; i++) {
279 fprintf(stderr, "%s %d\n", struct_name, i);
280 gen_print_group(stderr, group, gtt_offset + offset,
281 &data[(offset + i * struct_size) / 4], color);
282 }
283 }
284
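/* Decode and print the commands in the just-submitted batch, along with
 * the indirect state they point at, using the genxml decoder.  Only the
 * render ring is handled; this runs when DEBUG_BATCH is set.
 */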
285 static void
286 do_batch_dump(struct brw_context *brw)
287 {
288 struct intel_batchbuffer *batch = &brw->batch;
289 struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
290
291 if (batch->ring != RENDER_RING)
292 return;
293
294 void *map = brw_bo_map(brw, batch->bo, MAP_READ);
295 if (map == NULL) {
296 fprintf(stderr,
297 "WARNING: failed to map batchbuffer, "
298 "dumping uploaded data instead.\n");
299 }
300
301 uint32_t *data = map ? map : batch->map;
302 uint32_t *end = data + USED_BATCH(*batch);
303 uint32_t gtt_offset = map ? batch->bo->offset64 : 0;
304 int length;
305
306 bool color = INTEL_DEBUG & DEBUG_COLOR;
307 const char *header_color = color ? BLUE_HEADER : "";
308 const char *reset_color = color ? NORMAL : "";
309
310 for (uint32_t *p = data; p < end; p += length) {
311 struct gen_group *inst = gen_spec_find_instruction(spec, p);
312 length = gen_group_get_length(inst, p);
313 assert(inst == NULL || length > 0);
314 length = MAX2(1, length);
315 if (inst == NULL) {
316 fprintf(stderr, "unknown instruction %08x\n", p[0]);
317 continue;
318 }
319
320 uint64_t offset = gtt_offset + 4 * (p - data);
321
322 fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
323 offset, p[0], gen_group_get_name(inst), reset_color);
324
325 gen_print_group(stderr, inst, offset, p, color);
326
327 switch (gen_group_get_opcode(inst) >> 16) {
328 case _3DSTATE_PIPELINED_POINTERS:
329 /* Note: these Gen4-5 pointers are full relocations rather than
330 * offsets from the start of the batch. So we need to subtract
331 * gtt_offset (the start of the batch) to obtain an offset we
332 * can add to the map and get at the data.
333 */
334 decode_struct(brw, spec, "VS_STATE", data, gtt_offset,
335 (p[1] & ~0x1fu) - gtt_offset, color);
336 if (p[2] & 1) {
337 decode_struct(brw, spec, "GS_STATE", data, gtt_offset,
338 (p[2] & ~0x1fu) - gtt_offset, color);
339 }
340 if (p[3] & 1) {
341 decode_struct(brw, spec, "CLIP_STATE", data, gtt_offset,
342 (p[3] & ~0x1fu) - gtt_offset, color);
343 }
344 decode_struct(brw, spec, "SF_STATE", data, gtt_offset,
345 (p[4] & ~0x1fu) - gtt_offset, color);
346 decode_struct(brw, spec, "WM_STATE", data, gtt_offset,
347 (p[5] & ~0x1fu) - gtt_offset, color);
348 decode_struct(brw, spec, "COLOR_CALC_STATE", data, gtt_offset,
349 (p[6] & ~0x3fu) - gtt_offset, color);
350 break;
351 case _3DSTATE_BINDING_TABLE_POINTERS_VS:
352 case _3DSTATE_BINDING_TABLE_POINTERS_HS:
353 case _3DSTATE_BINDING_TABLE_POINTERS_DS:
354 case _3DSTATE_BINDING_TABLE_POINTERS_GS:
355 case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
356 struct gen_group *group =
357 gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
358 if (!group)
359 break;
360
361 uint32_t bt_offset = p[1] & ~0x1fu;
362 int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
363 uint32_t *bt_pointers = &data[bt_offset / 4];
364 for (int i = 0; i < bt_entries; i++) {
365 fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
366 gen_print_group(stderr, group, gtt_offset + bt_pointers[i],
367 &data[bt_pointers[i] / 4], color);
368 }
369 break;
370 }
371 case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
372 case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
373 case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
374 case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
375 case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
376 decode_structs(brw, spec, "SAMPLER_STATE", data,
377 gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
378 break;
379 case _3DSTATE_VIEWPORT_STATE_POINTERS:
380 decode_structs(brw, spec, "CLIP_VIEWPORT", data,
381 gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
382 decode_structs(brw, spec, "SF_VIEWPORT", data,
383 gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
384 decode_structs(brw, spec, "CC_VIEWPORT", data,
385 gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
386 break;
387 case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
388 decode_structs(brw, spec, "CC_VIEWPORT", data,
389 gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
390 break;
391 case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
392 decode_structs(brw, spec, "SF_CLIP_VIEWPORT", data,
393 gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
394 break;
395 case _3DSTATE_SCISSOR_STATE_POINTERS:
396 decode_structs(brw, spec, "SCISSOR_RECT", data,
397 gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
398 break;
399 case _3DSTATE_BLEND_STATE_POINTERS:
400 /* TODO: handle Gen8+ extra dword at the beginning */
401 decode_structs(brw, spec, "BLEND_STATE", data,
402 gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
403 break;
404 case _3DSTATE_CC_STATE_POINTERS:
405 if (brw->gen >= 7) {
406 decode_struct(brw, spec, "COLOR_CALC_STATE", data,
407 gtt_offset, p[1] & ~0x3fu, color);
408 } else if (brw->gen == 6) {
409 decode_structs(brw, spec, "BLEND_STATE", data,
410 gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
411 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
412 gtt_offset, p[2] & ~0x3fu, color);
413 decode_struct(brw, spec, "COLOR_CALC_STATE", data,
414 gtt_offset, p[3] & ~0x3fu, color);
415 }
416 break;
417 case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
418 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
419 gtt_offset, p[1] & ~0x3fu, color);
420 break;
421 }
422 }
423
424 if (map != NULL) {
425 brw_bo_unmap(batch->bo);
426 }
427 }
428 #else
429 static void do_batch_dump(struct brw_context *brw) { }
430 #endif
431
432 /**
433 * Called when starting a new batch buffer.
434 */
435 static void
436 brw_new_batch(struct brw_context *brw)
437 {
438 /* Unreference any BOs held by the previous batch, and reset counts. */
439 for (int i = 0; i < brw->batch.exec_count; i++) {
440 if (brw->batch.exec_bos[i] != brw->batch.bo) {
441 brw_bo_unreference(brw->batch.exec_bos[i]);
442 }
443 brw->batch.exec_bos[i] = NULL;
444 }
445 brw->batch.reloc_count = 0;
446 brw->batch.exec_count = 0;
447 brw->batch.aperture_space = BATCH_SZ;
448
449 /* Create a new batchbuffer and reset the associated state: */
450 intel_batchbuffer_reset_and_clear_render_cache(brw);
451
452 /* If the kernel supports hardware contexts, then most hardware state is
453 * preserved between batches; we only need to re-emit state that is required
454 * to be in every batch. Otherwise we need to re-emit all the state that
455 * would otherwise be stored in the context (which for all intents and
456 * purposes means everything).
457 */
458 if (brw->hw_ctx == 0)
459 brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
460
461 brw->ctx.NewDriverState |= BRW_NEW_BATCH;
462
463 brw->ib.index_size = -1;
464
465 /* We need to periodically reap the shader time results, because rollover
466 * happens every few seconds. We also want to see results every once in a
467 * while, because many programs won't cleanly destroy our context, so the
468 * end-of-run printout may not happen.
469 */
470 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
471 brw_collect_and_report_shader_time(brw);
472 }
473
474 /**
475 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
476 * sending it off.
477 *
478 * This function can emit state (say, to preserve registers that aren't saved
479 * between batches). All of this state MUST fit in the reserved space at the
480 * end of the batchbuffer. If you add more GPU state, increase the reserved
481 * space by updating the BATCH_RESERVED macro.
482 */
483 static void
484 brw_finish_batch(struct brw_context *brw)
485 {
486 /* Capture the closing pipeline statistics register values necessary to
487 * support query objects (in the non-hardware context world).
488 */
489 brw_emit_query_end(brw);
490
491 if (brw->batch.ring == RENDER_RING) {
492 /* Work around our L3 configuration leaking into contexts that were
493 * created with MI_RESTORE_INHIBIT and therefore assume the L3 cache
494 * is configured according to the hardware defaults.
495 */
496 if (brw->gen >= 7)
497 gen7_restore_default_l3_config(brw);
498
499 if (brw->is_haswell) {
500 /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
501 * 3DSTATE_CC_STATE_POINTERS > "Note":
502 *
503 * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
504 * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
505 *
506 * From the example in the docs, it seems to expect a regular pipe control
507 * flush here as well. We may have done it already, but meh.
508 *
509 * See also WaAvoidRCZCounterRollover.
510 */
511 brw_emit_mi_flush(brw);
512 BEGIN_BATCH(2);
513 OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
514 OUT_BATCH(brw->cc.state_offset | 1);
515 ADVANCE_BATCH();
516 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
517 PIPE_CONTROL_CS_STALL);
518 }
519 }
520 }
521
522 static void
523 throttle(struct brw_context *brw)
524 {
525 /* Wait for the swapbuffers before the one we just emitted, so we
526 * don't get too many swaps outstanding for apps that are GPU-heavy
527 * but not CPU-heavy.
528 *
529 * We're using intelDRI2Flush (called from the loader before
530 * swapbuffer) and glFlush (for front buffer rendering) as the
531 * indicator that a frame is done and then throttle when we get
532 * here as we prepare to render the next frame. At this point, the
533 * round trips for swap/copy and getting new buffers are done and
534 * we'll spend less time waiting on the GPU.
535 *
536 * Unfortunately, we don't have a handle to the batch containing
537 * the swap, and getting our hands on that doesn't seem worth it,
538 * so we just use the first batch we emitted after the last swap.
539 */
540 if (brw->need_swap_throttle && brw->throttle_batch[0]) {
541 if (brw->throttle_batch[1]) {
542 if (!brw->disable_throttling) {
543 /* Pass NULL rather than brw so we avoid perf_debug warnings;
544 * stalling is common and expected here...
545 */
546 brw_bo_wait_rendering(brw->throttle_batch[1]);
547 }
548 brw_bo_unreference(brw->throttle_batch[1]);
549 }
550 brw->throttle_batch[1] = brw->throttle_batch[0];
551 brw->throttle_batch[0] = NULL;
552 brw->need_swap_throttle = false;
553 /* Throttling here is more precise than the throttle ioctl, so skip it */
554 brw->need_flush_throttle = false;
555 }
556
557 if (brw->need_flush_throttle) {
558 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
559 drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
560 brw->need_flush_throttle = false;
561 }
562 }
563
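/* Thin wrapper around the DRM_IOCTL_I915_GEM_EXECBUFFER2(_WR) ioctl:
 * submits the validation list, wires up the optional input/output fence
 * fds, and writes back any new presumed offsets chosen by the kernel so
 * later batches can keep using I915_EXEC_NO_RELOC.
 */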
564 static int
565 execbuffer(int fd,
566 struct intel_batchbuffer *batch,
567 uint32_t ctx_id,
568 int used,
569 int in_fence,
570 int *out_fence,
571 int flags)
572 {
573 struct drm_i915_gem_execbuffer2 execbuf = {
574 .buffers_ptr = (uintptr_t) batch->validation_list,
575 .buffer_count = batch->exec_count,
576 .batch_start_offset = 0,
577 .batch_len = used,
578 .flags = flags,
579 .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
580 };
581
582 unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
583
584 if (in_fence != -1) {
585 execbuf.rsvd2 = in_fence;
586 execbuf.flags |= I915_EXEC_FENCE_IN;
587 }
588
589 if (out_fence != NULL) {
590 cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
591 *out_fence = -1;
592 execbuf.flags |= I915_EXEC_FENCE_OUT;
593 }
594
595 int ret = drmIoctl(fd, cmd, &execbuf);
596 if (ret != 0)
597 ret = -errno;
598
599 for (int i = 0; i < batch->exec_count; i++) {
600 struct brw_bo *bo = batch->exec_bos[i];
601
602 bo->idle = false;
603 bo->index = -1;
604
605 /* Update brw_bo::offset64 */
606 if (batch->validation_list[i].offset != bo->offset64) {
607 DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
608 bo->gem_handle, bo->offset64, batch->validation_list[i].offset);
609 bo->offset64 = batch->validation_list[i].offset;
610 }
611 }
612
613 if (ret == 0 && out_fence != NULL)
614 *out_fence = execbuf.rsvd2 >> 32;
615
616 return ret;
617 }
618
619 static int
620 do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
621 {
622 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
623 struct intel_batchbuffer *batch = &brw->batch;
624 int ret = 0;
625
626 if (brw->has_llc) {
627 brw_bo_unmap(batch->bo);
628 } else {
629 ret = brw_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
630 if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
631 ret = brw_bo_subdata(batch->bo,
632 batch->state_batch_offset,
633 batch->bo->size - batch->state_batch_offset,
634 (char *)batch->map + batch->state_batch_offset);
635 }
636 }
637
638 if (!brw->screen->no_hw) {
639 /* The requirements for using I915_EXEC_NO_RELOC are:
640 *
641 * The addresses written in the objects must match the corresponding
642 * reloc.presumed_offset which in turn must match the corresponding
643 * execobject.offset.
644 *
645 * Any render targets written to in the batch must be flagged with
646 * EXEC_OBJECT_WRITE.
647 *
648 * To avoid stalling, execobject.offset should match the current
649 * address of that object within the active context.
650 */
651 int flags = I915_EXEC_NO_RELOC;
652
653 if (brw->gen >= 6 && batch->ring == BLT_RING) {
654 flags |= I915_EXEC_BLT;
655 } else {
656 flags |= I915_EXEC_RENDER;
657 }
658 if (batch->needs_sol_reset)
659 flags |= I915_EXEC_GEN7_SOL_RESET;
660
661 if (ret == 0) {
662 uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
663
664 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
665 assert(entry->handle == batch->bo->gem_handle);
666 entry->relocation_count = batch->reloc_count;
667 entry->relocs_ptr = (uintptr_t) batch->relocs;
668
669 if (batch->use_batch_first) {
670 flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
671 } else {
672 /* Move the batch to the end of the validation list */
673 struct drm_i915_gem_exec_object2 tmp;
674 const unsigned index = batch->exec_count - 1;
675
676 tmp = *entry;
677 *entry = batch->validation_list[index];
678 batch->validation_list[index] = tmp;
679 }
680
681 ret = execbuffer(dri_screen->fd, batch, hw_ctx,
682 4 * USED_BATCH(*batch),
683 in_fence_fd, out_fence_fd, flags);
684 }
685
686 throttle(brw);
687 }
688
689 if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
690 do_batch_dump(brw);
691
692 if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
693 brw_check_for_reset(brw);
694
695 if (ret != 0) {
696 fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
697 exit(1);
698 }
699
700 return ret;
701 }
702
703 /**
704 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
705 * of the fd.
706 *
707 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
708 * of the returned fd.
709 */
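/* A minimal usage sketch (hypothetical caller), assuming the
 * intel_batchbuffer_flush() convenience macro in intel_batchbuffer.h
 * wraps this function with in_fence_fd = -1 and out_fence_fd = NULL:
 *
 *    int fence_fd = -1;
 *    if (_intel_batchbuffer_flush_fence(brw, -1, &fence_fd,
 *                                       __FILE__, __LINE__) == 0 &&
 *        fence_fd != -1) {
 *       // hand fence_fd to the winsys, which must eventually close() it
 *    }
 */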
710 int
711 _intel_batchbuffer_flush_fence(struct brw_context *brw,
712 int in_fence_fd, int *out_fence_fd,
713 const char *file, int line)
714 {
715 int ret;
716
717 if (USED_BATCH(brw->batch) == 0)
718 return 0;
719
720 if (brw->throttle_batch[0] == NULL) {
721 brw->throttle_batch[0] = brw->batch.bo;
722 brw_bo_reference(brw->throttle_batch[0]);
723 }
724
725 if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
726 int bytes_for_commands = 4 * USED_BATCH(brw->batch);
727 int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
728 int total_bytes = bytes_for_commands + bytes_for_state;
729 fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
730 "%4db (state) = %4db (%0.1f%%)\n", file, line,
731 bytes_for_commands, bytes_for_state,
732 total_bytes,
733 100.0f * total_bytes / BATCH_SZ);
734 }
735
736 brw->batch.reserved_space = 0;
737
738 brw_finish_batch(brw);
739
740 /* Mark the end of the buffer. */
741 intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
742 if (USED_BATCH(brw->batch) & 1) {
743 /* Round batchbuffer usage to 2 DWORDs. */
744 intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
745 }
746
747 intel_upload_finish(brw);
748
749 /* Check that we didn't just wrap our batchbuffer at a bad time. */
750 assert(!brw->no_batch_wrap);
751
752 ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);
753
754 if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
755 fprintf(stderr, "waiting for idle\n");
756 brw_bo_wait_rendering(brw->batch.bo);
757 }
758
759 /* Start a new batch buffer. */
760 brw_new_batch(brw);
761
762 return ret;
763 }
764
765 bool
766 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
767 {
768 return brw->batch.aperture_space + extra_space <=
769 brw->screen->aperture_threshold;
770 }
771
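/* Check whether @bo is referenced by the current batch, i.e. whether it
 * is already on the validation list (bo->index is used as a fast path,
 * then verified).
 */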
772 bool
773 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
774 {
775 unsigned index = READ_ONCE(bo->index);
776 if (index < batch->exec_count && batch->exec_bos[index] == bo)
777 return true;
778
779 for (int i = 0; i < batch->exec_count; i++) {
780 if (batch->exec_bos[i] == bo)
781 return true;
782 }
783 return false;
784 }
785
786 /* This is the only way buffers get added to the validation list.
787 */
788 uint64_t
789 brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
790 struct brw_bo *target, uint32_t target_offset,
791 uint32_t read_domains, uint32_t write_domain)
792 {
793 assert(target != NULL);
794
795 if (batch->reloc_count == batch->reloc_array_size) {
796 batch->reloc_array_size *= 2;
797 batch->relocs = realloc(batch->relocs,
798 batch->reloc_array_size *
799 sizeof(struct drm_i915_gem_relocation_entry));
800 }
801
802 /* Check args */
803 assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
804 assert(_mesa_bitcount(write_domain) <= 1);
805
806 unsigned int index = add_exec_bo(batch, target);
807 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
808
809 if (write_domain) {
810 entry->flags |= EXEC_OBJECT_WRITE;
811
812 /* PIPECONTROL needs a w/a on gen6 */
813 if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
814 struct brw_context *brw = container_of(batch, brw, batch);
815 if (brw->gen == 6)
816 entry->flags |= EXEC_OBJECT_NEEDS_GTT;
817 }
818 }
819
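/* In batch-first mode we also pass I915_EXEC_HANDLE_LUT, so the kernel
 * treats target_handle as an index into the validation list rather than
 * as a GEM handle, saving a handle lookup per relocation.
 */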
820 batch->relocs[batch->reloc_count++] =
821 (struct drm_i915_gem_relocation_entry) {
822 .offset = batch_offset,
823 .delta = target_offset,
824 .target_handle = batch->use_batch_first ? index : target->gem_handle,
825 .presumed_offset = entry->offset,
826 };
827
828 /* Using the old buffer offset, return what the right batch contents
829 * would be, in case the buffer doesn't move and we can short-circuit
830 * the relocation processing in the kernel.
831 */
832 return entry->offset + target_offset;
833 }
834
835 void
836 intel_batchbuffer_data(struct brw_context *brw,
837 const void *data, GLuint bytes, enum brw_gpu_ring ring)
838 {
839 assert((bytes & 3) == 0);
840 intel_batchbuffer_require_space(brw, bytes, ring);
841 memcpy(brw->batch.map_next, data, bytes);
842 brw->batch.map_next += bytes >> 2;
843 }
844
845 static void
846 load_sized_register_mem(struct brw_context *brw,
847 uint32_t reg,
848 struct brw_bo *bo,
849 uint32_t read_domains, uint32_t write_domain,
850 uint32_t offset,
851 int size)
852 {
853 int i;
854
855 /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
856 assert(brw->gen >= 7);
857
858 if (brw->gen >= 8) {
859 BEGIN_BATCH(4 * size);
860 for (i = 0; i < size; i++) {
861 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
862 OUT_BATCH(reg + i * 4);
863 OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
864 }
865 ADVANCE_BATCH();
866 } else {
867 BEGIN_BATCH(3 * size);
868 for (i = 0; i < size; i++) {
869 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
870 OUT_BATCH(reg + i * 4);
871 OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
872 }
873 ADVANCE_BATCH();
874 }
875 }
876
877 void
878 brw_load_register_mem(struct brw_context *brw,
879 uint32_t reg,
880 struct brw_bo *bo,
881 uint32_t read_domains, uint32_t write_domain,
882 uint32_t offset)
883 {
884 load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
885 }
886
887 void
888 brw_load_register_mem64(struct brw_context *brw,
889 uint32_t reg,
890 struct brw_bo *bo,
891 uint32_t read_domains, uint32_t write_domain,
892 uint32_t offset)
893 {
894 load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
895 }
896
897 /*
898 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
899 */
900 void
901 brw_store_register_mem32(struct brw_context *brw,
902 struct brw_bo *bo, uint32_t reg, uint32_t offset)
903 {
904 assert(brw->gen >= 6);
905
906 if (brw->gen >= 8) {
907 BEGIN_BATCH(4);
908 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
909 OUT_BATCH(reg);
910 OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
911 offset);
912 ADVANCE_BATCH();
913 } else {
914 BEGIN_BATCH(3);
915 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
916 OUT_BATCH(reg);
917 OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
918 offset);
919 ADVANCE_BATCH();
920 }
921 }
922
923 /*
924 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
925 */
926 void
927 brw_store_register_mem64(struct brw_context *brw,
928 struct brw_bo *bo, uint32_t reg, uint32_t offset)
929 {
930 assert(brw->gen >= 6);
931
932 /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
933 * read a full 64-bit register, we need to do two of them.
934 */
935 if (brw->gen >= 8) {
936 BEGIN_BATCH(8);
937 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
938 OUT_BATCH(reg);
939 OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
940 offset);
941 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
942 OUT_BATCH(reg + sizeof(uint32_t));
943 OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
944 offset + sizeof(uint32_t));
945 ADVANCE_BATCH();
946 } else {
947 BEGIN_BATCH(6);
948 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
949 OUT_BATCH(reg);
950 OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
951 offset);
952 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
953 OUT_BATCH(reg + sizeof(uint32_t));
954 OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
955 offset + sizeof(uint32_t));
956 ADVANCE_BATCH();
957 }
958 }
959
960 /*
961 * Write a 32-bit register using immediate data.
962 */
963 void
964 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
965 {
966 assert(brw->gen >= 6);
967
968 BEGIN_BATCH(3);
969 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
970 OUT_BATCH(reg);
971 OUT_BATCH(imm);
972 ADVANCE_BATCH();
973 }
974
975 /*
976 * Write a 64-bit register using immediate data.
977 */
978 void
979 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
980 {
981 assert(brw->gen >= 6);
982
983 BEGIN_BATCH(5);
984 OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
985 OUT_BATCH(reg);
986 OUT_BATCH(imm & 0xffffffff);
987 OUT_BATCH(reg + 4);
988 OUT_BATCH(imm >> 32);
989 ADVANCE_BATCH();
990 }
991
992 /*
993 * Copies a 32-bit register.
994 */
995 void
996 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
997 {
998 assert(brw->gen >= 8 || brw->is_haswell);
999
1000 BEGIN_BATCH(3);
1001 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1002 OUT_BATCH(src);
1003 OUT_BATCH(dest);
1004 ADVANCE_BATCH();
1005 }
1006
1007 /*
1008 * Copies a 64-bit register.
1009 */
1010 void
1011 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
1012 {
1013 assert(brw->gen >= 8 || brw->is_haswell);
1014
1015 BEGIN_BATCH(6);
1016 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1017 OUT_BATCH(src);
1018 OUT_BATCH(dest);
1019 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1020 OUT_BATCH(src + sizeof(uint32_t));
1021 OUT_BATCH(dest + sizeof(uint32_t));
1022 ADVANCE_BATCH();
1023 }
1024
1025 /*
1026 * Write 32-bits of immediate data to a GPU memory buffer.
1027 */
1028 void
1029 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1030 uint32_t offset, uint32_t imm)
1031 {
1032 assert(brw->gen >= 6);
1033
1034 BEGIN_BATCH(4);
1035 OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1036 if (brw->gen >= 8)
1037 OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1038 offset);
1039 else {
1040 OUT_BATCH(0); /* MBZ */
1041 OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1042 offset);
1043 }
1044 OUT_BATCH(imm);
1045 ADVANCE_BATCH();
1046 }
1047
1048 /*
1049 * Write 64-bits of immediate data to a GPU memory buffer.
1050 */
1051 void
1052 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1053 uint32_t offset, uint64_t imm)
1054 {
1055 assert(brw->gen >= 6);
1056
1057 BEGIN_BATCH(5);
1058 OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1059 if (brw->gen >= 8)
1060 OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1061 offset);
1062 else {
1063 OUT_BATCH(0); /* MBZ */
1064 OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1065 offset);
1066 }
1067 OUT_BATCH(imm & 0xffffffffu);
1068 OUT_BATCH(imm >> 32);
1069 ADVANCE_BATCH();
1070 }