diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index f9334ee13d7..62a8310f68b 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -32,6 +32,10 @@
 
 #include "blorp/blorp_genX_exec.h"
 
+#if GEN_GEN <= 5
+#include "gen4_blorp_exec.h"
+#endif
+
 #include "brw_blorp.h"
 
 static void *
@@ -40,7 +44,7 @@ blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
 
-   intel_batchbuffer_begin(brw, n, RENDER_RING);
+   intel_batchbuffer_begin(brw, n);
    uint32_t *map = brw->batch.map_next;
    brw->batch.map_next += n;
    intel_batchbuffer_advance(brw);
@@ -53,12 +57,21 @@ blorp_emit_reloc(struct blorp_batch *batch,
 {
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
+   uint32_t offset;
 
-   uint32_t offset = (char *)location - (char *)brw->batch.map;
-   return intel_batchbuffer_reloc(&brw->batch, address.buffer, offset,
-                                  address.read_domains,
-                                  address.write_domain,
-                                  address.offset + delta);
+   if (GEN_GEN < 6 && brw_ptr_in_state_buffer(&brw->batch, location)) {
+      offset = (char *)location - (char *)brw->batch.state.map;
+      return brw_state_reloc(&brw->batch, offset,
+                             address.buffer, address.offset + delta,
+                             address.reloc_flags);
+   }
+
+   assert(!brw_ptr_in_state_buffer(&brw->batch, location));
+
+   offset = (char *)location - (char *)brw->batch.batch.map;
+   return brw_batch_reloc(&brw->batch, offset,
+                          address.buffer, address.offset + delta,
+                          address.reloc_flags);
 }
 
 static void
@@ -67,14 +80,13 @@ blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
 {
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
-   drm_intel_bo *bo = address.buffer;
+   struct brw_bo *bo = address.buffer;
 
-   drm_intel_bo_emit_reloc(brw->batch.bo, ss_offset,
-                           bo, address.offset + delta,
-                           address.read_domains, address.write_domain);
+   uint64_t reloc_val =
+      brw_state_reloc(&brw->batch, ss_offset, bo, address.offset + delta,
+                      address.reloc_flags);
 
-   uint64_t reloc_val = bo->offset64 + address.offset + delta;
-   void *reloc_ptr = (void *)brw->batch.map + ss_offset;
+   void *reloc_ptr = (void *)brw->batch.state.map + ss_offset;
 #if GEN_GEN >= 8
    *(uint64_t *)reloc_ptr = reloc_val;
 #else
@@ -82,6 +94,27 @@ blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
 #endif
 }
 
+static uint64_t
+blorp_get_surface_address(struct blorp_batch *blorp_batch,
+                          struct blorp_address address)
+{
+   /* We'll let blorp_surface_reloc write the address. */
+   return 0ull;
+}
+
+#if GEN_GEN >= 7 && GEN_GEN < 10
+static struct blorp_address
+blorp_get_surface_base_address(struct blorp_batch *batch)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+   return (struct blorp_address) {
+      .buffer = brw->batch.state.bo,
+      .offset = 0,
+   };
+}
+#endif
+
 static void *
 blorp_alloc_dynamic_state(struct blorp_batch *batch,
                           uint32_t size,
@@ -122,21 +155,97 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
 
+   /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
+    *
+    *    "The VF cache needs to be invalidated before binding and then using
+    *     Vertex Buffers that overlap with any previously bound Vertex Buffer
+    *     (at a 64B granularity) since the last invalidation. A VF cache
+    *     invalidate is performed by setting the "VF Cache Invalidation Enable"
+    *     bit in PIPE_CONTROL."
+    *
+    * This restriction first appears in the Skylake PRM but the internal docs
+    * also list it as being an issue on Broadwell. In order to avoid this
+    * problem, we align all vertex buffer allocations to 64 bytes.
+    */
    uint32_t offset;
-   void *data = brw_state_batch(brw, size, 32, &offset);
+   void *data = brw_state_batch(brw, size, 64, &offset);
 
    *addr = (struct blorp_address) {
-      .buffer = brw->batch.bo,
-      .read_domains = I915_GEM_DOMAIN_VERTEX,
-      .write_domain = 0,
+      .buffer = brw->batch.state.bo,
       .offset = offset,
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses. If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions. To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .reloc_flags = RELOC_32BIT,
+
+#if GEN_GEN == 11
+      .mocs = ICL_MOCS_WB,
+#elif GEN_GEN == 10
+      .mocs = CNL_MOCS_WB,
+#elif GEN_GEN == 9
+      .mocs = SKL_MOCS_WB,
+#elif GEN_GEN == 8
+      .mocs = BDW_MOCS_WB,
+#elif GEN_GEN == 7
+      .mocs = GEN7_MOCS_L3,
+#elif GEN_GEN > 6
+#error "Missing MOCS setting!"
+#endif
    };
 
    return data;
 }
 
+/**
+ * See vf_invalidate_for_vb_48b_transitions in genX_state_upload.c.
+ */
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           unsigned num_vbs)
+{
+#if GEN_GEN >= 8 && GEN_GEN < 11
+   struct brw_context *brw = batch->driver_batch;
+   bool need_invalidate = false;
+
+   for (unsigned i = 0; i < num_vbs; i++) {
+      struct brw_bo *bo = addrs[i].buffer;
+      uint16_t high_bits =
+         bo && (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32u : 0;
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL);
+   }
+#endif
+}
+
+#if GEN_GEN >= 8
+static struct blorp_address
+blorp_get_workaround_page(struct blorp_batch *batch)
+{
+   assert(batch->blorp->driver_ctx == batch->driver_batch);
+   struct brw_context *brw = batch->driver_batch;
+
+   return (struct blorp_address) {
+      .buffer = brw->workaround_bo,
+   };
+}
+#endif
+
 static void
-blorp_flush_range(struct blorp_batch *batch, void *start, size_t size)
+blorp_flush_range(UNUSED struct blorp_batch *batch, UNUSED void *start,
+                  UNUSED size_t size)
 {
    /* All allocated states come from the batch which we will flush before we
     * submit it. There's nothing for us to do here.
@@ -144,21 +253,23 @@ blorp_flush_range(struct blorp_batch *batch, void *start, size_t size)
 }
 
 static void
-blorp_emit_urb_config(struct blorp_batch *batch, unsigned vs_entry_size)
+blorp_emit_urb_config(struct blorp_batch *batch,
+                      unsigned vs_entry_size,
+                      UNUSED unsigned sf_entry_size)
 {
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
 
 #if GEN_GEN >= 7
-   if (!(brw->ctx.NewDriverState & (BRW_NEW_CONTEXT | BRW_NEW_URB_SIZE)) &&
-       brw->urb.vsize >= vs_entry_size)
+   if (brw->urb.vsize >= vs_entry_size)
      return;
 
-   brw->ctx.NewDriverState |= BRW_NEW_URB_SIZE;
-   gen7_upload_urb(brw, vs_entry_size, false, false);
-#else
+#elif GEN_GEN == 6
    gen6_upload_urb(brw, vs_entry_size, false, 0);
+#else
+   /* We calculate it now and emit later. */
+   brw_calculate_urb_fence(brw, 0, vs_entry_size, sf_entry_size);
 #endif
 }
 
@@ -169,9 +280,22 @@ genX(blorp_exec)(struct blorp_batch *batch,
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
    struct gl_context *ctx = &brw->ctx;
-   const uint32_t estimated_max_batch_usage = GEN_GEN >= 8 ? 1800 : 1500;
    bool check_aperture_failed_once = false;
 
+#if GEN_GEN >= 11
+   /* The PIPE_CONTROL command description says:
+    *
+    *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
+    *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
+    *     Target Cache Flush by enabling this bit. When render target flush
+    *     is set due to new association of BTI, PS Scoreboard Stall bit must
+    *     be set in this packet."
+    */
+   brw_emit_pipe_control_flush(brw,
+                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_STALL_AT_SCOREBOARD);
+#endif
+
    /* Flush the sampler and render caches. We definitely need to flush the
    * sampler cache so that we get updated contents from the render cache for
    * the glBlitFramebuffer() source. Also, we are sometimes warned in the
@@ -180,17 +304,25 @@ genX(blorp_exec)(struct blorp_batch *batch,
    * data.
*/ if (params->src.enabled) - brw_render_cache_set_check_flush(brw, params->src.addr.buffer); - brw_render_cache_set_check_flush(brw, params->dst.addr.buffer); + brw_cache_flush_for_read(brw, params->src.addr.buffer); + if (params->dst.enabled) { + brw_cache_flush_for_render(brw, params->dst.addr.buffer, + params->dst.view.format, + params->dst.aux_usage); + } + if (params->depth.enabled) + brw_cache_flush_for_depth(brw, params->depth.addr.buffer); + if (params->stencil.enabled) + brw_cache_flush_for_depth(brw, params->stencil.addr.buffer); brw_select_pipeline(brw, BRW_RENDER_PIPELINE); retry: - intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING); + intel_batchbuffer_require_space(brw, 1400); + brw_require_statebuffer_space(brw, 600); intel_batchbuffer_save_state(brw); - drm_intel_bo *saved_bo = brw->batch.bo; - uint32_t saved_used = USED_BATCH(brw->batch); - uint32_t saved_state_batch_offset = brw->batch.state_batch_offset; + check_aperture_failed_once |= intel_batchbuffer_saved_state_is_empty(brw); + brw->batch.no_wrap = true; #if GEN_GEN == 6 /* Emit workaround flushes when we switch from drawing to blorping. */ @@ -203,12 +335,20 @@ retry: gen7_l3_state.emit(brw); #endif +#if GEN_GEN >= 6 brw_emit_depth_stall_flushes(brw); +#endif #if GEN_GEN == 8 gen8_write_pma_stall_bits(brw, 0); #endif + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; + if (brw->current_hash_scale != scale) { + brw_emit_hashing_mode(brw, params->x1 - params->x0, + params->y1 - params->y0, scale); + } + blorp_emit(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1; rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1; @@ -216,23 +356,13 @@ retry: blorp_exec(batch, params); - /* Make sure we didn't wrap the batch unintentionally, and make sure we - * reserved enough space that a wrap will never happen. - */ - assert(brw->batch.bo == saved_bo); - assert((USED_BATCH(brw->batch) - saved_used) * 4 + - (saved_state_batch_offset - brw->batch.state_batch_offset) < - estimated_max_batch_usage); - /* Shut up compiler warnings on release build */ - (void)saved_bo; - (void)saved_used; - (void)saved_state_batch_offset; + brw->batch.no_wrap = false; /* Check if the blorp op we just did would make our batch likely to fail to * map all the BOs into the GPU at batch exec time later. If so, flush the * batch and try again with nothing else in the batch. */ - if (drm_intel_bufmgr_check_aperture_space(&brw->batch.bo, 1)) { + if (!brw_batch_has_aperture_space(brw, 0)) { if (!check_aperture_failed_once) { check_aperture_failed_once = true; intel_batchbuffer_reset_to_saved(brw); @@ -252,13 +382,17 @@ retry: * rendering tracks for GL. */ brw->ctx.NewDriverState |= BRW_NEW_BLORP; - brw->no_depth_or_stencil = false; - brw->ib.type = -1; - - if (params->dst.enabled) - brw_render_cache_set_add_bo(brw, params->dst.addr.buffer); + brw->no_depth_or_stencil = !params->depth.enabled && + !params->stencil.enabled; + brw->ib.index_size = -1; + + if (params->dst.enabled) { + brw_render_cache_add_bo(brw, params->dst.addr.buffer, + params->dst.view.format, + params->dst.aux_usage); + } if (params->depth.enabled) - brw_render_cache_set_add_bo(brw, params->depth.addr.buffer); + brw_depth_cache_add_bo(brw, params->depth.addr.buffer); if (params->stencil.enabled) - brw_render_cache_set_add_bo(brw, params->stencil.addr.buffer); + brw_depth_cache_add_bo(brw, params->stencil.addr.buffer); }
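
The VF-cache workaround this patch introduces is the subtlest part of the diff. As the comments in blorp_alloc_vertex_buffer explain, the vertex-fetch cache tags entries using only the low 32 bits of a buffer's GPU address, so once buffers can live above 4 GiB, two bindings whose addresses differ only in bits 47:32 can collide. blorp_vf_invalidate_for_vb_48b_transitions therefore remembers the high 16 address bits per vertex-buffer slot and requests a VF cache invalidate whenever they change. The following standalone C sketch reproduces just that bookkeeping; it is illustrative only, and every name in it (MAX_VBS, vf_needs_invalidate, last_high_bits) is hypothetical rather than part of the driver:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_VBS 33 /* hypothetical number of vertex-buffer slots */

/* High 16 bits of the 48-bit GPU address last seen in each VB slot. */
static uint16_t last_high_bits[MAX_VBS];

/* Returns true if a VF cache invalidate is needed before binding the
 * given buffer addresses: the cache tags only bits 31:0, so a change
 * in bits 47:32 of any slot could alias a previously cached entry.
 */
static bool
vf_needs_invalidate(const uint64_t *gpu_addrs, unsigned num_vbs)
{
   bool need_invalidate = false;

   for (unsigned i = 0; i < num_vbs; i++) {
      uint16_t high_bits = gpu_addrs[i] >> 32;

      if (high_bits != last_high_bits[i]) {
         need_invalidate = true;
         last_high_bits[i] = high_bits;
      }
   }

   return need_invalidate;
}

int main(void)
{
   /* Two buffers exactly 4 GiB apart: identical low 32 bits, different
    * high bits, so rebinding the slot must trigger an invalidate.
    */
   uint64_t first[]  = { 0x000100000000ull };
   uint64_t second[] = { 0x000200000000ull };

   printf("first bind:  invalidate? %d\n", vf_needs_invalidate(first, 1));
   printf("second bind: invalidate? %d\n", vf_needs_invalidate(second, 1));
   printf("rebind:      invalidate? %d\n", vf_needs_invalidate(second, 1));
   return 0;
}

The same aliasing argument explains the RELOC_32BIT flag in blorp_alloc_vertex_buffer: keeping blorp's own vertex data in the low 32 bits of the address space means its high bits are always zero, so it never forces an invalidate by itself.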
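
The retry: loop in genX(blorp_exec) is also worth spelling out, since the diff only shows fragments of it. The pattern is: checkpoint the batch, emit the blorp commands with wrapping disabled, then ask whether all referenced BOs still fit in the aperture; on the first failure, roll back to the checkpoint, flush what was already queued, and rebuild the same commands into an empty batch. A schematic, compilable version follows, with hypothetical helper names standing in for the i965 batchbuffer API (this is a sketch of the control flow, not the driver's actual functions):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the driver's batchbuffer helpers. */
static void save_state(void)          { puts("checkpoint batch"); }
static void reset_to_saved(void)      { puts("roll back to checkpoint"); }
static void flush_batch(void)         { puts("flush batch"); }
static void emit_blorp_commands(void) { puts("emit blorp commands"); }

/* Pretend the first aperture check fails and the second succeeds. */
static bool has_aperture_space(void)
{
   static int calls;
   return ++calls > 1;
}

int main(void)
{
   bool failed_once = false;

retry:
   save_state();
   emit_blorp_commands();

   if (!has_aperture_space()) {
      if (!failed_once) {
         /* First failure: discard what we emitted, submit everything
          * queued so far, and retry once on an empty batch.
          */
         failed_once = true;
         reset_to_saved();
         flush_batch();
         goto retry;
      }
      /* Second failure: even an empty batch cannot fit the BOs;
       * nothing more can be done here.
       */
   }
   return 0;
}

Note the related subtlety the patch handles with intel_batchbuffer_saved_state_is_empty: if the batch was already empty at the checkpoint, retrying on a fresh batch cannot help, so check_aperture_failed_once is pre-set and the retry is skipped.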