src/mesa/drivers/dri/i965/intel_batchbuffer.c

   1 /*
   2  * Copyright 2006 VMware, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the
   7  * "Software"), to deal in the Software without restriction, including
   8  * without limitation the rights to use, copy, modify, merge, publish,
   9  * distribute, sublicense, and/or sell copies of the Software, and to
  10  * permit persons to whom the Software is furnished to do so, subject to
  11  * the following conditions:
  12  *
  13  * The above copyright notice and this permission notice (including the
  14  * next paragraph) shall be included in all copies or substantial portions
  15  * of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  19  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  20  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  21  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  22  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  23  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  24  */
  25
  26 #include "intel_batchbuffer.h"
  27 #include "intel_buffer_objects.h"
  28 #include "brw_bufmgr.h"
  29 #include "intel_buffers.h"
  30 #include "intel_fbo.h"
  31 #include "brw_context.h"
  32 #include "brw_defines.h"
  33 #include "brw_state.h"
  34 #include "common/gen_decoder.h"
  35
  36 #include "util/hash_table.h"
  37
  38 #include <xf86drm.h>
  39 #include <i915_drm.h>
  40
  41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
  42
  43 static void
  44 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
  45                         struct brw_bufmgr *bufmgr,
  46                         bool has_llc);
  47
  48 static bool
  49 uint_key_compare(const void *a, const void *b)
  50 {
  51    return a == b;
  52 }
  53
  54 static uint32_t
  55 uint_key_hash(const void *key)
  56 {
  57    return (uintptr_t) key;
  58 }
  59
  60 void
  61 intel_batchbuffer_init(struct intel_batchbuffer *batch,
  62                        struct brw_bufmgr *bufmgr,
  63                        bool has_llc)
  64 {
  65    intel_batchbuffer_reset(batch, bufmgr, has_llc);
  66
  67    if (!has_llc) {
  68       batch->cpu_map = malloc(BATCH_SZ);
  69       batch->map = batch->cpu_map;
  70       batch->map_next = batch->cpu_map;
  71    }
  72
  73    batch->reloc_count = 0;
  74    batch->reloc_array_size = 250;
  75    batch->relocs = malloc(batch->reloc_array_size *
  76                           sizeof(struct drm_i915_gem_relocation_entry));
  77    batch->exec_count = 0;
  78    batch->exec_array_size = 100;
  79    batch->exec_bos =
  80       malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
  81    batch->validation_list =
  82       malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
  83
  84    if (INTEL_DEBUG & DEBUG_BATCH) {
  85       batch->state_batch_sizes =
  86          _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
  87    }
  88 }
  89
  90 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
  91
  92 static unsigned
  93 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
  94 {
  95    if (bo != batch->bo) {
  96       unsigned index = READ_ONCE(bo->index);
  97
  98       if (index < batch->exec_count && batch->exec_bos[index] == bo)
  99          return index;
 100
 101       /* May have been shared between multiple active batches */
 102       for (index = 0; index < batch->exec_count; index++) {
 103          if (batch->exec_bos[index] == bo)
 104             return index;
 105       }
 106
 107       brw_bo_reference(bo);
 108    }
 109
 110    if (batch->exec_count == batch->exec_array_size) {
 111       batch->exec_array_size *= 2;
 112       batch->exec_bos =
 113          realloc(batch->exec_bos,
 114                  batch->exec_array_size * sizeof(batch->exec_bos[0]));
 115       batch->validation_list =
 116          realloc(batch->validation_list,
 117                  batch->exec_array_size * sizeof(batch->validation_list[0]));
 118    }
 119
 120    struct drm_i915_gem_exec_object2 *validation_entry =
 121       &batch->validation_list[batch->exec_count];
 122    validation_entry->handle = bo->gem_handle;
 123    if (bo == batch->bo) {
 124       validation_entry->relocation_count = batch->reloc_count;
 125       validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
 126    } else {
 127       validation_entry->relocation_count = 0;
 128       validation_entry->relocs_ptr = 0;
 129    }
 130    validation_entry->alignment = bo->align;
 131    validation_entry->offset = bo->offset64;
 132    validation_entry->flags = bo->kflags;
 133    validation_entry->rsvd1 = 0;
 134    validation_entry->rsvd2 = 0;
 135
 136    bo->index = batch->exec_count;
 137    batch->exec_bos[batch->exec_count] = bo;
 138    batch->aperture_space += bo->size;
 139
 140    return batch->exec_count++;
 141 }
 142
 143 static void
 144 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
 145                         struct brw_bufmgr *bufmgr,
 146                         bool has_llc)
 147 {
 148    if (batch->last_bo != NULL) {
 149       brw_bo_unreference(batch->last_bo);
 150       batch->last_bo = NULL;
 151    }
 152    batch->last_bo = batch->bo;
 153
 154    batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
 155    if (has_llc) {
 156       batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
 157    }
 158    batch->map_next = batch->map;
 159
 160    batch->reserved_space = BATCH_RESERVED;
 161    batch->state_batch_offset = batch->bo->size;
 162    batch->needs_sol_reset = false;
 163    batch->state_base_address_emitted = false;
 164
 165    /* We don't know what ring the new batch will be sent to until we see the
 166     * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
 167     */
 168    batch->ring = UNKNOWN_RING;
 169
 170    if (batch->state_batch_sizes)
 171       _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
 172 }
 173
 174 static void
 175 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
 176 {
 177    intel_batchbuffer_reset(&brw->batch, brw->bufmgr, brw->has_llc);
 178    brw_render_cache_set_clear(brw);
 179 }
 180
 181 void
 182 intel_batchbuffer_save_state(struct brw_context *brw)
 183 {
 184    brw->batch.saved.map_next = brw->batch.map_next;
 185    brw->batch.saved.reloc_count = brw->batch.reloc_count;
 186    brw->batch.saved.exec_count = brw->batch.exec_count;
 187 }
 188
 189 void
 190 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 191 {
 192    for (int i = brw->batch.saved.exec_count;
 193         i < brw->batch.exec_count; i++) {
 194       if (brw->batch.exec_bos[i] != brw->batch.bo) {
 195          brw_bo_unreference(brw->batch.exec_bos[i]);
 196       }
 197    }
 198    brw->batch.reloc_count = brw->batch.saved.reloc_count;
 199    brw->batch.exec_count = brw->batch.saved.exec_count;
 200
 201    brw->batch.map_next = brw->batch.saved.map_next;
 202    if (USED_BATCH(brw->batch) == 0)
 203       brw->batch.ring = UNKNOWN_RING;
 204 }
 205
 206 void
 207 intel_batchbuffer_free(struct intel_batchbuffer *batch)
 208 {
 209    free(batch->cpu_map);
 210
 211    for (int i = 0; i < batch->exec_count; i++) {
 212       if (batch->exec_bos[i] != batch->bo) {
 213          brw_bo_unreference(batch->exec_bos[i]);
 214       }
 215    }
 216    free(batch->relocs);
 217    free(batch->exec_bos);
 218    free(batch->validation_list);
 219
 220    brw_bo_unreference(batch->last_bo);
 221    brw_bo_unreference(batch->bo);
 222    if (batch->state_batch_sizes)
 223       _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
 224 }
 225
 226 void
 227 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
 228                                 enum brw_gpu_ring ring)
 229 {
 230    /* If we're switching rings, implicitly flush the batch. */
 231    if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
 232        brw->gen >= 6) {
 233       intel_batchbuffer_flush(brw);
 234    }
 235
 236 #ifdef DEBUG
 237    assert(sz < BATCH_SZ - BATCH_RESERVED);
 238 #endif
 239    if (intel_batchbuffer_space(&brw->batch) < sz)
 240       intel_batchbuffer_flush(brw);
 241
 242    /* The intel_batchbuffer_flush() calls above might have changed
 243     * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
 244     */
 245    brw->batch.ring = ring;
 246 }
 247
 248 #ifdef DEBUG
 249 #define CSI "\e["
 250 #define BLUE_HEADER  CSI "0;44m"
 251 #define NORMAL       CSI "0m"
 252
 253
 254 static void
 255 decode_struct(struct brw_context *brw, struct gen_spec *spec,
 256               const char *struct_name, uint32_t *data,
 257               uint32_t gtt_offset, uint32_t offset, bool color)
 258 {
 259    struct gen_group *group = gen_spec_find_struct(spec, struct_name);
 260    if (!group)
 261       return;
 262
 263    fprintf(stderr, "%s\n", struct_name);
 264    gen_print_group(stderr, group, gtt_offset + offset,
 265                    &data[offset / 4], color);
 266 }
 267
 268 static void
 269 decode_structs(struct brw_context *brw, struct gen_spec *spec,
 270                const char *struct_name,
 271                uint32_t *data, uint32_t gtt_offset, uint32_t offset,
 272                int struct_size, bool color)
 273 {
 274    struct gen_group *group = gen_spec_find_struct(spec, struct_name);
 275    if (!group)
 276       return;
 277
 278    int entries = brw_state_batch_size(brw, offset) / struct_size;
 279    for (int i = 0; i < entries; i++) {
 280       fprintf(stderr, "%s %d\n", struct_name, i);
 281       gen_print_group(stderr, group, gtt_offset + offset,
 282                       &data[(offset + i * struct_size) / 4], color);
 283    }
 284 }
 285
 286 static void
 287 do_batch_dump(struct brw_context *brw)
 288 {
 289    struct intel_batchbuffer *batch = &brw->batch;
 290    struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
 291
 292    if (batch->ring != RENDER_RING)
 293       return;
 294
 295    void *map = brw_bo_map(brw, batch->bo, MAP_READ);
 296    if (map == NULL) {
 297       fprintf(stderr,
 298               "WARNING: failed to map batchbuffer, "
 299               "dumping uploaded data instead.\n");
 300    }
 301
 302    uint32_t *data = map ? map : batch->map;
 303    uint32_t *end = data + USED_BATCH(*batch);
 304    uint32_t gtt_offset = map ? batch->bo->offset64 : 0;
 305    int length;
 306
 307    bool color = INTEL_DEBUG & DEBUG_COLOR;
 308    const char *header_color = color ? BLUE_HEADER : "";
 309    const char *reset_color  = color ? NORMAL : "";
 310
 311    for (uint32_t *p = data; p < end; p += length) {
 312       struct gen_group *inst = gen_spec_find_instruction(spec, p);
 313       length = gen_group_get_length(inst, p);
 314       assert(inst == NULL || length > 0);
 315       length = MAX2(1, length);
 316       if (inst == NULL) {
 317          fprintf(stderr, "unknown instruction %08x\n", p[0]);
 318          continue;
 319       }
 320
 321       uint64_t offset = gtt_offset + 4 * (p - data);
 322
 323       fprintf(stderr, "%s0x%08"PRIx64":  0x%08x:  %-80s%s\n", header_color,
 324               offset, p[0], gen_group_get_name(inst), reset_color);
 325
 326       gen_print_group(stderr, inst, offset, p, color);
 327
 328       switch (gen_group_get_opcode(inst) >> 16) {
 329       case _3DSTATE_PIPELINED_POINTERS:
 330          /* Note: these Gen4-5 pointers are full relocations rather than
 331           * offsets from the start of the batch.  So we need to subtract
 332           * gtt_offset (the start of the batch) to obtain an offset we
 333           * can add to the map and get at the data.
 334           */
 335          decode_struct(brw, spec, "VS_STATE", data, gtt_offset,
 336                        (p[1] & ~0x1fu) - gtt_offset, color);
 337          if (p[2] & 1) {
 338             decode_struct(brw, spec, "GS_STATE", data, gtt_offset,
 339                           (p[2] & ~0x1fu) - gtt_offset, color);
 340          }
 341          if (p[3] & 1) {
 342             decode_struct(brw, spec, "CLIP_STATE", data, gtt_offset,
 343                           (p[3] & ~0x1fu) - gtt_offset, color);
 344          }
 345          decode_struct(brw, spec, "SF_STATE", data, gtt_offset,
 346                        (p[4] & ~0x1fu) - gtt_offset, color);
 347          decode_struct(brw, spec, "WM_STATE", data, gtt_offset,
 348                        (p[5] & ~0x1fu) - gtt_offset, color);
 349          decode_struct(brw, spec, "COLOR_CALC_STATE", data, gtt_offset,
 350                        (p[6] & ~0x3fu) - gtt_offset, color);
 351          break;
 352       case _3DSTATE_BINDING_TABLE_POINTERS_VS:
 353       case _3DSTATE_BINDING_TABLE_POINTERS_HS:
 354       case _3DSTATE_BINDING_TABLE_POINTERS_DS:
 355       case _3DSTATE_BINDING_TABLE_POINTERS_GS:
 356       case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
 357          struct gen_group *group =
 358             gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
 359          if (!group)
 360             break;
 361
 362          uint32_t bt_offset = p[1] & ~0x1fu;
 363          int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
 364          uint32_t *bt_pointers = &data[bt_offset / 4];
 365          for (int i = 0; i < bt_entries; i++) {
 366             fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
 367             gen_print_group(stderr, group, gtt_offset + bt_pointers[i],
 368                             &data[bt_pointers[i] / 4], color);
 369          }
 370          break;
 371       }
 372       case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
 373       case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
 374       case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
 375       case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
 376       case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
 377          decode_structs(brw, spec, "SAMPLER_STATE", data,
 378                         gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
 379          break;
 380       case _3DSTATE_VIEWPORT_STATE_POINTERS:
 381          decode_structs(brw, spec, "CLIP_VIEWPORT", data,
 382                         gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
 383          decode_structs(brw, spec, "SF_VIEWPORT", data,
 384                         gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
 385          decode_structs(brw, spec, "CC_VIEWPORT", data,
 386                         gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
 387          break;
 388       case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
 389          decode_structs(brw, spec, "CC_VIEWPORT", data,
 390                         gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
 391          break;
 392       case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
 393          decode_structs(brw, spec, "SF_CLIP_VIEWPORT", data,
 394                         gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
 395          break;
 396       case _3DSTATE_SCISSOR_STATE_POINTERS:
 397          decode_structs(brw, spec, "SCISSOR_RECT", data,
 398                         gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
 399          break;
 400       case _3DSTATE_BLEND_STATE_POINTERS:
 401          /* TODO: handle Gen8+ extra dword at the beginning */
 402          decode_structs(brw, spec, "BLEND_STATE", data,
 403                         gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
 404          break;
 405       case _3DSTATE_CC_STATE_POINTERS:
 406          if (brw->gen >= 7) {
 407             decode_struct(brw, spec, "COLOR_CALC_STATE", data,
 408                           gtt_offset, p[1] & ~0x3fu, color);
 409          } else if (brw->gen == 6) {
 410             decode_structs(brw, spec, "BLEND_STATE", data,
 411                            gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
 412             decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
 413                           gtt_offset, p[2] & ~0x3fu, color);
 414             decode_struct(brw, spec, "COLOR_CALC_STATE", data,
 415                           gtt_offset, p[3] & ~0x3fu, color);
 416          }
 417          break;
 418       case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
 419          decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
 420                        gtt_offset, p[1] & ~0x3fu, color);
 421          break;
 422       }
 423    }
 424
 425    if (map != NULL) {
 426       brw_bo_unmap(batch->bo);
 427    }
 428 }
 429 #else
 430 static void do_batch_dump(struct brw_context *brw) { }
 431 #endif
 432
 433 /**
 434  * Called when starting a new batch buffer.
 435  */
 436 static void
 437 brw_new_batch(struct brw_context *brw)
 438 {
 439    /* Unreference any BOs held by the previous batch, and reset counts. */
 440    for (int i = 0; i < brw->batch.exec_count; i++) {
 441       if (brw->batch.exec_bos[i] != brw->batch.bo) {
 442          brw_bo_unreference(brw->batch.exec_bos[i]);
 443       }
 444       brw->batch.exec_bos[i] = NULL;
 445    }
 446    brw->batch.reloc_count = 0;
 447    brw->batch.exec_count = 0;
 448    brw->batch.aperture_space = BATCH_SZ;
 449
 450    /* Create a new batchbuffer and reset the associated state: */
 451    intel_batchbuffer_reset_and_clear_render_cache(brw);
 452
 453    /* If the kernel supports hardware contexts, then most hardware state is
 454     * preserved between batches; we only need to re-emit state that is required
 455     * to be in every batch.  Otherwise we need to re-emit all the state that
 456     * would otherwise be stored in the context (which for all intents and
 457     * purposes means everything).
 458     */
 459    if (brw->hw_ctx == 0)
 460       brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
 461
 462    brw->ctx.NewDriverState |= BRW_NEW_BATCH;
 463
 464    brw->ib.index_size = -1;
 465
 466    /* We need to periodically reap the shader time results, because rollover
 467     * happens every few seconds.  We also want to see results every once in a
 468     * while, because many programs won't cleanly destroy our context, so the
 469     * end-of-run printout may not happen.
 470     */
 471    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
 472       brw_collect_and_report_shader_time(brw);
 473 }
 474
 475 /**
 476  * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 477  * sending it off.
 478  *
 479  * This function can emit state (say, to preserve registers that aren't saved
 480  * between batches).  All of this state MUST fit in the reserved space at the
 481  * end of the batchbuffer.  If you add more GPU state, increase the reserved
 482  * space by updating the BATCH_RESERVED macro.
 483  */
 484 static void
 485 brw_finish_batch(struct brw_context *brw)
 486 {
 487    /* Capture the closing pipeline statistics register values necessary to
 488     * support query objects (in the non-hardware context world).
 489     */
 490    brw_emit_query_end(brw);
 491
 492    if (brw->batch.ring == RENDER_RING) {
 493       /* Work around L3 state leaks into contexts set MI_RESTORE_INHIBIT which
 494        * assume that the L3 cache is configured according to the hardware
 495        * defaults.
 496        */
 497       if (brw->gen >= 7)
 498          gen7_restore_default_l3_config(brw);
 499
 500       if (brw->is_haswell) {
 501          /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
 502           * 3DSTATE_CC_STATE_POINTERS > "Note":
 503           *
 504           * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
 505           *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
 506           *
 507           * From the example in the docs, it seems to expect a regular pipe control
 508           * flush here as well. We may have done it already, but meh.
 509           *
 510           * See also WaAvoidRCZCounterRollover.
 511           */
 512          brw_emit_mi_flush(brw);
 513          BEGIN_BATCH(2);
 514          OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
 515          OUT_BATCH(brw->cc.state_offset | 1);
 516          ADVANCE_BATCH();
 517          brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
 518                                           PIPE_CONTROL_CS_STALL);
 519       }
 520    }
 521 }
 522
 523 static void
 524 throttle(struct brw_context *brw)
 525 {
 526    /* Wait for the swapbuffers before the one we just emitted, so we
 527     * don't get too many swaps outstanding for apps that are GPU-heavy
 528     * but not CPU-heavy.
 529     *
 530     * We're using intelDRI2Flush (called from the loader before
 531     * swapbuffer) and glFlush (for front buffer rendering) as the
 532     * indicator that a frame is done and then throttle when we get
 533     * here as we prepare to render the next frame.  At this point for
 534     * round trips for swap/copy and getting new buffers are done and
 535     * we'll spend less time waiting on the GPU.
 536     *
 537     * Unfortunately, we don't have a handle to the batch containing
 538     * the swap, and getting our hands on that doesn't seem worth it,
 539     * so we just use the first batch we emitted after the last swap.
 540     */
 541    if (brw->need_swap_throttle && brw->throttle_batch[0]) {
 542       if (brw->throttle_batch[1]) {
 543          if (!brw->disable_throttling) {
 544             /* Pass NULL rather than brw so we avoid perf_debug warnings;
 545              * stalling is common and expected here...
 546              */
 547             brw_bo_wait_rendering(brw->throttle_batch[1]);
 548          }
 549          brw_bo_unreference(brw->throttle_batch[1]);
 550       }
 551       brw->throttle_batch[1] = brw->throttle_batch[0];
 552       brw->throttle_batch[0] = NULL;
 553       brw->need_swap_throttle = false;
 554       /* Throttling here is more precise than the throttle ioctl, so skip it */
 555       brw->need_flush_throttle = false;
 556    }
 557
 558    if (brw->need_flush_throttle) {
 559       __DRIscreen *dri_screen = brw->screen->driScrnPriv;
 560       drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
 561       brw->need_flush_throttle = false;
 562    }
 563 }
 564
 565 static int
 566 execbuffer(int fd,
 567            struct intel_batchbuffer *batch,
 568            uint32_t ctx_id,
 569            int used,
 570            int in_fence,
 571            int *out_fence,
 572            int flags)
 573 {
 574    struct drm_i915_gem_execbuffer2 execbuf = {
 575       .buffers_ptr = (uintptr_t) batch->validation_list,
 576       .buffer_count = batch->exec_count,
 577       .batch_start_offset = 0,
 578       .batch_len = used,
 579       .flags = flags,
 580       .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
 581    };
 582
 583    unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
 584
 585    if (in_fence != -1) {
 586       execbuf.rsvd2 = in_fence;
 587       execbuf.flags |= I915_EXEC_FENCE_IN;
 588    }
 589
 590    if (out_fence != NULL) {
 591       cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
 592       *out_fence = -1;
 593       execbuf.flags |= I915_EXEC_FENCE_OUT;
 594    }
 595
 596    int ret = drmIoctl(fd, cmd, &execbuf);
 597    if (ret != 0)
 598       ret = -errno;
 599
 600    for (int i = 0; i < batch->exec_count; i++) {
 601       struct brw_bo *bo = batch->exec_bos[i];
 602
 603       bo->idle = false;
 604       bo->index = -1;
 605
 606       /* Update brw_bo::offset64 */
 607       if (batch->validation_list[i].offset != bo->offset64) {
 608          DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
 609              bo->gem_handle, bo->offset64, batch->validation_list[i].offset);
 610          bo->offset64 = batch->validation_list[i].offset;
 611       }
 612    }
 613
 614    if (ret == 0 && out_fence != NULL)
 615       *out_fence = execbuf.rsvd2 >> 32;
 616
 617    return ret;
 618 }
 619
 620 static int
 621 do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
 622 {
 623    __DRIscreen *dri_screen = brw->screen->driScrnPriv;
 624    struct intel_batchbuffer *batch = &brw->batch;
 625    int ret = 0;
 626
 627    if (brw->has_llc) {
 628       brw_bo_unmap(batch->bo);
 629    } else {
 630       ret = brw_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
 631       if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
 632          ret = brw_bo_subdata(batch->bo,
 633                                     batch->state_batch_offset,
 634                                     batch->bo->size - batch->state_batch_offset,
 635                                     (char *)batch->map + batch->state_batch_offset);
 636       }
 637    }
 638
 639    if (!brw->screen->no_hw) {
 640       /* The requirement for using I915_EXEC_NO_RELOC are:
 641        *
 642        *   The addresses written in the objects must match the corresponding
 643        *   reloc.presumed_offset which in turn must match the corresponding
 644        *   execobject.offset.
 645        *
 646        *   Any render targets written to in the batch must be flagged with
 647        *   EXEC_OBJECT_WRITE.
 648        *
 649        *   To avoid stalling, execobject.offset should match the current
 650        *   address of that object within the active context.
 651        */
 652       int flags = I915_EXEC_NO_RELOC;
 653
 654       if (brw->gen >= 6 && batch->ring == BLT_RING) {
 655          flags |= I915_EXEC_BLT;
 656       } else {
 657          flags |= I915_EXEC_RENDER;
 658       }
 659       if (batch->needs_sol_reset)
 660          flags |= I915_EXEC_GEN7_SOL_RESET;
 661
 662       if (ret == 0) {
 663          uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
 664
 665          /* Add the batch itself to the end of the validation list */
 666          add_exec_bo(batch, batch->bo);
 667
 668          ret = execbuffer(dri_screen->fd, batch, hw_ctx,
 669                           4 * USED_BATCH(*batch),
 670                           in_fence_fd, out_fence_fd, flags);
 671       }
 672
 673       throttle(brw);
 674    }
 675
 676    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
 677       do_batch_dump(brw);
 678
 679    if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
 680       brw_check_for_reset(brw);
 681
 682    if (ret != 0) {
 683       fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
 684       exit(1);
 685    }
 686
 687    return ret;
 688 }
 689
 690 /**
 691  * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
 692  * of the fd.
 693  *
 694  * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
 695  * of the returned fd.
 696  */
 697 int
 698 _intel_batchbuffer_flush_fence(struct brw_context *brw,
 699                                int in_fence_fd, int *out_fence_fd,
 700                                const char *file, int line)
 701 {
 702    int ret;
 703
 704    if (USED_BATCH(brw->batch) == 0)
 705       return 0;
 706
 707    if (brw->throttle_batch[0] == NULL) {
 708       brw->throttle_batch[0] = brw->batch.bo;
 709       brw_bo_reference(brw->throttle_batch[0]);
 710    }
 711
 712    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
 713       int bytes_for_commands = 4 * USED_BATCH(brw->batch);
 714       int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
 715       int total_bytes = bytes_for_commands + bytes_for_state;
 716       fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
 717               "%4db (state) = %4db (%0.1f%%)\n", file, line,
 718               bytes_for_commands, bytes_for_state,
 719               total_bytes,
 720               100.0f * total_bytes / BATCH_SZ);
 721    }
 722
 723    brw->batch.reserved_space = 0;
 724
 725    brw_finish_batch(brw);
 726
 727    /* Mark the end of the buffer. */
 728    intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
 729    if (USED_BATCH(brw->batch) & 1) {
 730       /* Round batchbuffer usage to 2 DWORDs. */
 731       intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
 732    }
 733
 734    intel_upload_finish(brw);
 735
 736    /* Check that we didn't just wrap our batchbuffer at a bad time. */
 737    assert(!brw->no_batch_wrap);
 738
 739    ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);
 740
 741    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
 742       fprintf(stderr, "waiting for idle\n");
 743       brw_bo_wait_rendering(brw->batch.bo);
 744    }
 745
 746    /* Start a new batch buffer. */
 747    brw_new_batch(brw);
 748
 749    return ret;
 750 }
 751
 752 bool
 753 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
 754 {
 755    return brw->batch.aperture_space + extra_space <=
 756           brw->screen->aperture_threshold;
 757 }
 758
 759 bool
 760 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
 761 {
 762    unsigned index = READ_ONCE(bo->index);
 763    if (index < batch->exec_count && batch->exec_bos[index] == bo)
 764       return true;
 765
 766    for (int i = 0; i < batch->exec_count; i++) {
 767       if (batch->exec_bos[i] == bo)
 768          return true;
 769    }
 770    return false;
 771 }
 772
 773 /*  This is the only way buffers get added to the validate list.
 774  */
 775 uint64_t
 776 brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
 777                struct brw_bo *target, uint32_t target_offset,
 778                uint32_t read_domains, uint32_t write_domain)
 779 {
 780    assert(target != NULL);
 781
 782    if (batch->reloc_count == batch->reloc_array_size) {
 783       batch->reloc_array_size *= 2;
 784       batch->relocs = realloc(batch->relocs,
 785                               batch->reloc_array_size *
 786                               sizeof(struct drm_i915_gem_relocation_entry));
 787    }
 788
 789    /* Check args */
 790    assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
 791    assert(_mesa_bitcount(write_domain) <= 1);
 792
 793    uint64_t offset64;
 794    if (target != batch->bo) {
 795       unsigned int index = add_exec_bo(batch, target);
 796       struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
 797
 798       if (write_domain) {
 799          entry->flags |= EXEC_OBJECT_WRITE;
 800
 801          /* PIPECONTROL needs a w/a on gen6 */
 802          if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
 803             struct brw_context *brw = container_of(batch, brw, batch);
 804             if (brw->gen == 6)
 805                entry->flags |= EXEC_OBJECT_NEEDS_GTT;
 806          }
 807       }
 808
 809       offset64 = entry->offset;
 810    } else {
 811       offset64 = target->offset64;
 812    }
 813
 814    batch->relocs[batch->reloc_count++] =
 815       (struct drm_i915_gem_relocation_entry) {
 816          .offset = batch_offset,
 817          .delta = target_offset,
 818          .target_handle = target->gem_handle,
 819          .presumed_offset = offset64,
 820       };
 821
 822    /* Using the old buffer offset, write in what the right data would be, in
 823     * case the buffer doesn't move and we can short-circuit the relocation
 824     * processing in the kernel
 825     */
 826    return offset64 + target_offset;
 827 }
 828
 829 void
 830 intel_batchbuffer_data(struct brw_context *brw,
 831                        const void *data, GLuint bytes, enum brw_gpu_ring ring)
 832 {
 833    assert((bytes & 3) == 0);
 834    intel_batchbuffer_require_space(brw, bytes, ring);
 835    memcpy(brw->batch.map_next, data, bytes);
 836    brw->batch.map_next += bytes >> 2;
 837 }
 838
 839 static void
 840 load_sized_register_mem(struct brw_context *brw,
 841                         uint32_t reg,
 842                         struct brw_bo *bo,
 843                         uint32_t read_domains, uint32_t write_domain,
 844                         uint32_t offset,
 845                         int size)
 846 {
 847    int i;
 848
 849    /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
 850    assert(brw->gen >= 7);
 851
 852    if (brw->gen >= 8) {
 853       BEGIN_BATCH(4 * size);
 854       for (i = 0; i < size; i++) {
 855          OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
 856          OUT_BATCH(reg + i * 4);
 857          OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
 858       }
 859       ADVANCE_BATCH();
 860    } else {
 861       BEGIN_BATCH(3 * size);
 862       for (i = 0; i < size; i++) {
 863          OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
 864          OUT_BATCH(reg + i * 4);
 865          OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
 866       }
 867       ADVANCE_BATCH();
 868    }
 869 }
 870
 871 void
 872 brw_load_register_mem(struct brw_context *brw,
 873                       uint32_t reg,
 874                       struct brw_bo *bo,
 875                       uint32_t read_domains, uint32_t write_domain,
 876                       uint32_t offset)
 877 {
 878    load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
 879 }
 880
 881 void
 882 brw_load_register_mem64(struct brw_context *brw,
 883                         uint32_t reg,
 884                         struct brw_bo *bo,
 885                         uint32_t read_domains, uint32_t write_domain,
 886                         uint32_t offset)
 887 {
 888    load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
 889 }
 890
 891 /*
 892  * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 893  */
 894 void
 895 brw_store_register_mem32(struct brw_context *brw,
 896                          struct brw_bo *bo, uint32_t reg, uint32_t offset)
 897 {
 898    assert(brw->gen >= 6);
 899
 900    if (brw->gen >= 8) {
 901       BEGIN_BATCH(4);
 902       OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
 903       OUT_BATCH(reg);
 904       OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 905                   offset);
 906       ADVANCE_BATCH();
 907    } else {
 908       BEGIN_BATCH(3);
 909       OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
 910       OUT_BATCH(reg);
 911       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 912                 offset);
 913       ADVANCE_BATCH();
 914    }
 915 }
 916
 917 /*
 918  * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 919  */
 920 void
 921 brw_store_register_mem64(struct brw_context *brw,
 922                          struct brw_bo *bo, uint32_t reg, uint32_t offset)
 923 {
 924    assert(brw->gen >= 6);
 925
 926    /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
 927     * read a full 64-bit register, we need to do two of them.
 928     */
 929    if (brw->gen >= 8) {
 930       BEGIN_BATCH(8);
 931       OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
 932       OUT_BATCH(reg);
 933       OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 934                   offset);
 935       OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
 936       OUT_BATCH(reg + sizeof(uint32_t));
 937       OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 938                   offset + sizeof(uint32_t));
 939       ADVANCE_BATCH();
 940    } else {
 941       BEGIN_BATCH(6);
 942       OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
 943       OUT_BATCH(reg);
 944       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 945                 offset);
 946       OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
 947       OUT_BATCH(reg + sizeof(uint32_t));
 948       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 949                 offset + sizeof(uint32_t));
 950       ADVANCE_BATCH();
 951    }
 952 }
 953
 954 /*
 955  * Write a 32-bit register using immediate data.
 956  */
 957 void
 958 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
 959 {
 960    assert(brw->gen >= 6);
 961
 962    BEGIN_BATCH(3);
 963    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
 964    OUT_BATCH(reg);
 965    OUT_BATCH(imm);
 966    ADVANCE_BATCH();
 967 }
 968
 969 /*
 970  * Write a 64-bit register using immediate data.
 971  */
 972 void
 973 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
 974 {
 975    assert(brw->gen >= 6);
 976
 977    BEGIN_BATCH(5);
 978    OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
 979    OUT_BATCH(reg);
 980    OUT_BATCH(imm & 0xffffffff);
 981    OUT_BATCH(reg + 4);
 982    OUT_BATCH(imm >> 32);
 983    ADVANCE_BATCH();
 984 }
 985
 986 /*
 987  * Copies a 32-bit register.
 988  */
 989 void
 990 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
 991 {
 992    assert(brw->gen >= 8 || brw->is_haswell);
 993
 994    BEGIN_BATCH(3);
 995    OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
 996    OUT_BATCH(src);
 997    OUT_BATCH(dest);
 998    ADVANCE_BATCH();
 999 }
1000
1001 /*
1002  * Copies a 64-bit register.
1003  */
1004 void
1005 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
1006 {
1007    assert(brw->gen >= 8 || brw->is_haswell);
1008
1009    BEGIN_BATCH(6);
1010    OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1011    OUT_BATCH(src);
1012    OUT_BATCH(dest);
1013    OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1014    OUT_BATCH(src + sizeof(uint32_t));
1015    OUT_BATCH(dest + sizeof(uint32_t));
1016    ADVANCE_BATCH();
1017 }
1018
1019 /*
1020  * Write 32-bits of immediate data to a GPU memory buffer.
1021  */
1022 void
1023 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1024                      uint32_t offset, uint32_t imm)
1025 {
1026    assert(brw->gen >= 6);
1027
1028    BEGIN_BATCH(4);
1029    OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1030    if (brw->gen >= 8)
1031       OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1032                   offset);
1033    else {
1034       OUT_BATCH(0); /* MBZ */
1035       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1036                 offset);
1037    }
1038    OUT_BATCH(imm);
1039    ADVANCE_BATCH();
1040 }
1041
1042 /*
1043  * Write 64-bits of immediate data to a GPU memory buffer.
1044  */
1045 void
1046 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1047                      uint32_t offset, uint64_t imm)
1048 {
1049    assert(brw->gen >= 6);
1050
1051    BEGIN_BATCH(5);
1052    OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1053    if (brw->gen >= 8)
1054       OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1055                   offset);
1056    else {
1057       OUT_BATCH(0); /* MBZ */
1058       OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
1059                 offset);
1060    }
1061    OUT_BATCH(imm & 0xffffffffu);
1062    OUT_BATCH(imm >> 32);
1063    ADVANCE_BATCH();
1064 }