src/mesa/drivers/dri/i965/genX_blorp_exec.c

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25
  26 #include "intel_batchbuffer.h"
  27 #include "intel_mipmap_tree.h"
  28 #include "intel_fbo.h"
  29
  30 #include "brw_context.h"
  31 #include "brw_state.h"
  32
  33 #include "blorp/blorp_genX_exec.h"
  34
  35 #if GEN_GEN <= 5
  36 #include "gen4_blorp_exec.h"
  37 #endif
  38
  39 #include "brw_blorp.h"
  40
  41 static void *
  42 blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
  43 {
  44    assert(batch->blorp->driver_ctx == batch->driver_batch);
  45    struct brw_context *brw = batch->driver_batch;
  46
  47    intel_batchbuffer_begin(brw, n);
  48    uint32_t *map = brw->batch.map_next;
  49    brw->batch.map_next += n;
  50    intel_batchbuffer_advance(brw);
  51    return map;
  52 }
  53
  54 static uint64_t
  55 blorp_emit_reloc(struct blorp_batch *batch,
  56                  void *location, struct blorp_address address, uint32_t delta)
  57 {
  58    assert(batch->blorp->driver_ctx == batch->driver_batch);
  59    struct brw_context *brw = batch->driver_batch;
  60    uint32_t offset;
  61
  62    if (GEN_GEN < 6 && brw_ptr_in_state_buffer(&brw->batch, location)) {
  63       offset = (char *)location - (char *)brw->batch.state.map;
  64       return brw_state_reloc(&brw->batch, offset,
  65                              address.buffer, address.offset + delta,
  66                              address.reloc_flags);
  67    }
  68
  69    assert(!brw_ptr_in_state_buffer(&brw->batch, location));
  70
  71    offset = (char *)location - (char *)brw->batch.batch.map;
  72    return brw_batch_reloc(&brw->batch, offset,
  73                           address.buffer, address.offset + delta,
  74                           address.reloc_flags);
  75 }
  76
  77 static void
  78 blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
  79                     struct blorp_address address, uint32_t delta)
  80 {
  81    assert(batch->blorp->driver_ctx == batch->driver_batch);
  82    struct brw_context *brw = batch->driver_batch;
  83    struct brw_bo *bo = address.buffer;
  84
  85    uint64_t reloc_val =
  86       brw_state_reloc(&brw->batch, ss_offset, bo, address.offset + delta,
  87                       address.reloc_flags);
  88
  89    void *reloc_ptr = (void *)brw->batch.state.map + ss_offset;
  90 #if GEN_GEN >= 8
  91    *(uint64_t *)reloc_ptr = reloc_val;
  92 #else
  93    *(uint32_t *)reloc_ptr = reloc_val;
  94 #endif
  95 }
  96
  97 #if GEN_GEN >= 7 && GEN_GEN < 10
  98 static struct blorp_address
  99 blorp_get_surface_base_address(struct blorp_batch *batch)
 100 {
 101    assert(batch->blorp->driver_ctx == batch->driver_batch);
 102    struct brw_context *brw = batch->driver_batch;
 103    return (struct blorp_address) {
 104       .buffer = brw->batch.state.bo,
 105       .offset = 0,
 106    };
 107 }
 108 #endif
 109
 110 static void *
 111 blorp_alloc_dynamic_state(struct blorp_batch *batch,
 112                           uint32_t size,
 113                           uint32_t alignment,
 114                           uint32_t *offset)
 115 {
 116    assert(batch->blorp->driver_ctx == batch->driver_batch);
 117    struct brw_context *brw = batch->driver_batch;
 118
 119    return brw_state_batch(brw, size, alignment, offset);
 120 }
 121
 122 static void
 123 blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
 124                           unsigned state_size, unsigned state_alignment,
 125                           uint32_t *bt_offset, uint32_t *surface_offsets,
 126                           void **surface_maps)
 127 {
 128    assert(batch->blorp->driver_ctx == batch->driver_batch);
 129    struct brw_context *brw = batch->driver_batch;
 130
 131    uint32_t *bt_map = brw_state_batch(brw,
 132                                       num_entries * sizeof(uint32_t), 32,
 133                                       bt_offset);
 134
 135    for (unsigned i = 0; i < num_entries; i++) {
 136       surface_maps[i] = brw_state_batch(brw,
 137                                         state_size, state_alignment,
 138                                         &(surface_offsets)[i]);
 139       bt_map[i] = surface_offsets[i];
 140    }
 141 }
 142
 143 static void *
 144 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
 145                           struct blorp_address *addr)
 146 {
 147    assert(batch->blorp->driver_ctx == batch->driver_batch);
 148    struct brw_context *brw = batch->driver_batch;
 149
 150    /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
 151     *
 152     *    "The VF cache needs to be invalidated before binding and then using
 153     *    Vertex Buffers that overlap with any previously bound Vertex Buffer
 154     *    (at a 64B granularity) since the last invalidation.  A VF cache
 155     *    invalidate is performed by setting the "VF Cache Invalidation Enable"
 156     *    bit in PIPE_CONTROL."
 157     *
 158     * This restriction first appears in the Skylake PRM but the internal docs
 159     * also list it as being an issue on Broadwell.  In order to avoid this
 160     * problem, we align all vertex buffer allocations to 64 bytes.
 161     */
 162    uint32_t offset;
 163    void *data = brw_state_batch(brw, size, 64, &offset);
 164
 165    *addr = (struct blorp_address) {
 166       .buffer = brw->batch.state.bo,
 167       .offset = offset,
 168
 169       /* The VF cache designers apparently cut corners, and made the cache
 170        * only consider the bottom 32 bits of memory addresses.  If you happen
 171        * to have two vertex buffers which get placed exactly 4 GiB apart and
 172        * use them in back-to-back draw calls, you can get collisions.  To work
 173        * around this problem, we restrict vertex buffers to the low 32 bits of
 174        * the address space.
 175        */
 176       .reloc_flags = RELOC_32BIT,
 177
 178 #if GEN_GEN == 10
 179       .mocs = CNL_MOCS_WB,
 180 #elif GEN_GEN == 9
 181       .mocs = SKL_MOCS_WB,
 182 #elif GEN_GEN == 8
 183       .mocs = BDW_MOCS_WB,
 184 #elif GEN_GEN == 7
 185       .mocs = GEN7_MOCS_L3,
 186 #endif
 187    };
 188
 189    return data;
 190 }
 191
 192 /**
 193  * See vf_invalidate_for_vb_48b_transitions in genX_state_upload.c.
 194  */
 195 static void
 196 blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
 197                                            const struct blorp_address *addrs,
 198                                            unsigned num_vbs)
 199 {
 200 #if GEN_GEN >= 8 && GEN_GEN < 11
 201    struct brw_context *brw = batch->driver_batch;
 202    bool need_invalidate = false;
 203
 204    for (unsigned i = 0; i < num_vbs; i++) {
 205       struct brw_bo *bo = addrs[i].buffer;
 206       uint16_t high_bits =
 207          bo && (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32u : 0;
 208
 209       if (high_bits != brw->vb.last_bo_high_bits[i]) {
 210          need_invalidate = true;
 211          brw->vb.last_bo_high_bits[i] = high_bits;
 212       }
 213    }
 214
 215    if (need_invalidate) {
 216       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL);
 217    }
 218 #endif
 219 }
 220
 221 #if GEN_GEN >= 8
 222 static struct blorp_address
 223 blorp_get_workaround_page(struct blorp_batch *batch)
 224 {
 225    assert(batch->blorp->driver_ctx == batch->driver_batch);
 226    struct brw_context *brw = batch->driver_batch;
 227
 228    return (struct blorp_address) {
 229       .buffer = brw->workaround_bo,
 230    };
 231 }
 232 #endif
 233
 234 static void
 235 blorp_flush_range(UNUSED struct blorp_batch *batch, UNUSED void *start,
 236                   UNUSED size_t size)
 237 {
 238    /* All allocated states come from the batch which we will flush before we
 239     * submit it.  There's nothing for us to do here.
 240     */
 241 }
 242
 243 static void
 244 blorp_emit_urb_config(struct blorp_batch *batch,
 245                       unsigned vs_entry_size,
 246                       MAYBE_UNUSED unsigned sf_entry_size)
 247 {
 248    assert(batch->blorp->driver_ctx == batch->driver_batch);
 249    struct brw_context *brw = batch->driver_batch;
 250
 251 #if GEN_GEN >= 7
 252    if (brw->urb.vsize >= vs_entry_size)
 253       return;
 254
 255    gen7_upload_urb(brw, vs_entry_size, false, false);
 256 #elif GEN_GEN == 6
 257    gen6_upload_urb(brw, vs_entry_size, false, 0);
 258 #else
 259    /* We calculate it now and emit later. */
 260    brw_calculate_urb_fence(brw, 0, vs_entry_size, sf_entry_size);
 261 #endif
 262 }
 263
/**
 * Run one blorp operation, described by \p params, on the context that owns
 * \p batch.
 *
 * In order, this function:
 *  - flushes caches for the source, destination, depth and stencil buffers
 *    that \p params enables,
 *  - selects the render pipeline,
 *  - reserves batch and state space, saves the batch state and disables
 *    wrapping,
 *  - emits per-generation setup and a drawing rectangle, then calls
 *    blorp_exec(),
 *  - checks aperture space, and on the first failure rolls the batch back to
 *    the saved state, flushes, and redoes the emission from the retry label,
 *  - flags BRW_NEW_BLORP and adds the written buffers to the render/depth
 *    cache tracking.
 */
void
genX(blorp_exec)(struct blorp_batch *batch,
                 const struct blorp_params *params)
{
   assert(batch->blorp->driver_ctx == batch->driver_batch);
   struct brw_context *brw = batch->driver_batch;
   /* NOTE(review): ctx is never referenced by name below; presumably a macro
    * used in this function (WARN_ONCE is the likely candidate) expands to code
    * that uses it.  Confirm before renaming or removing it.
    */
   struct gl_context *ctx = &brw->ctx;
   /* True once a retry is no longer worthwhile: either we already retried, or
    * the saved batch state was reported as empty.
    */
   bool check_aperture_failed_once = false;

#if GEN_GEN >= 11
   /* The PIPE_CONTROL command description says:
    *
    * "Whenever a Binding Table Index (BTI) used by a Render Target Message
    *  points to a different RENDER_SURFACE_STATE, SW must issue a Render
    *  Target Cache Flush by enabling this bit. When render target flush
    *  is set due to new association of BTI, PS Scoreboard Stall bit must
    *  be set in this packet."
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
#endif

   /* Flush the sampler and render caches.  We definitely need to flush the
    * sampler cache so that we get updated contents from the render cache for
    * the glBlitFramebuffer() source.  Also, we are sometimes warned in the
    * docs to flush the cache between reinterpretations of the same surface
    * data with different formats, which blorp does for stencil and depth
    * data.
    */
   if (params->src.enabled)
      brw_cache_flush_for_read(brw, params->src.addr.buffer);
   if (params->dst.enabled) {
      brw_cache_flush_for_render(brw, params->dst.addr.buffer,
                                 params->dst.view.format,
                                 params->dst.aux_usage);
   }
   if (params->depth.enabled)
      brw_cache_flush_for_depth(brw, params->depth.addr.buffer);
   if (params->stencil.enabled)
      brw_cache_flush_for_depth(brw, params->stencil.addr.buffer);

   brw_select_pipeline(brw, BRW_RENDER_PIPELINE);

   /* Everything from here down to the aperture check is emitted again if the
    * first attempt does not fit.
    */
retry:
   /* NOTE(review): 1400 and 600 are presumably upper bounds on the batch and
    * state space one blorp operation needs -- confirm units and headroom.
    */
   intel_batchbuffer_require_space(brw, 1400);
   brw_require_statebuffer_space(brw, 600);
   /* Snapshot the batch so a failed attempt can be rolled back below. */
   intel_batchbuffer_save_state(brw);
   /* An empty saved state is treated the same as "already retried", so the
    * aperture check below will not attempt a rollback-and-retry.
    */
   check_aperture_failed_once |= intel_batchbuffer_saved_state_is_empty(brw);
   /* Wrapping stays disabled until blorp_exec() has returned. */
   brw->batch.no_wrap = true;

#if GEN_GEN == 6
   /* Emit workaround flushes when we switch from drawing to blorping. */
   brw_emit_post_sync_nonzero_flush(brw);
#endif

   brw_upload_state_base_address(brw);

#if GEN_GEN >= 8
   gen7_l3_state.emit(brw);
#endif

#if GEN_GEN >= 6
   brw_emit_depth_stall_flushes(brw);
#endif

#if GEN_GEN == 8
   gen8_write_pma_stall_bits(brw, 0);
#endif

   /* Only the max corner is set here, from the larger of each coordinate
    * pair minus one.
    */
   blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
      rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
      rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
   }

   blorp_exec(batch, params);

   brw->batch.no_wrap = false;

   /* Check if the blorp op we just did would make our batch likely to fail to
    * map all the BOs into the GPU at batch exec time later.  If so, flush the
    * batch and try again with nothing else in the batch.
    */
   if (!brw_batch_has_aperture_space(brw, 0)) {
      if (!check_aperture_failed_once) {
         /* First failure: drop what we just emitted, submit whatever was in
          * the batch before us, and redo the operation.
          */
         check_aperture_failed_once = true;
         intel_batchbuffer_reset_to_saved(brw);
         intel_batchbuffer_flush(brw);
         goto retry;
      } else {
         /* No further retry: flush as-is and warn if the flush itself
          * reports -ENOSPC.
          */
         int ret = intel_batchbuffer_flush(brw);
         WARN_ONCE(ret == -ENOSPC,
                   "i965: blorp emit exceeded available aperture space\n");
      }
   }

   if (unlikely(brw->always_flush_batch))
      intel_batchbuffer_flush(brw);

   /* We've smashed all state compared to what the normal 3D pipeline
    * rendering tracks for GL.
    */
   brw->ctx.NewDriverState |= BRW_NEW_BLORP;
   brw->no_depth_or_stencil = !params->depth.enabled &&
                              !params->stencil.enabled;
   /* NOTE(review): -1 presumably marks the tracked index buffer size as
    * unknown so it is re-emitted on the next draw -- confirm.
    */
   brw->ib.index_size = -1;

   /* Register every buffer this operation wrote with the render/depth cache
    * tracking, mirroring the flushes done on entry.
    */
   if (params->dst.enabled) {
      brw_render_cache_add_bo(brw, params->dst.addr.buffer,
                              params->dst.view.format,
                              params->dst.aux_usage);
   }
   if (params->depth.enabled)
      brw_depth_cache_add_bo(brw, params->depth.addr.buffer);
   if (params->stencil.enabled)
      brw_depth_cache_add_bo(brw, params->stencil.addr.buffer);
}