From fffba41c6828b8f46a162185147d3e9b9cc479e4 Mon Sep 17 00:00:00 2001
From: Paul Berry
Date: Wed, 27 Mar 2013 09:49:17 -0700
Subject: [PATCH] i965/gs: Allocate URB space for use by GS.

Previously, we gave all of the URB space (other than the small amount
that is used for push constants) to the vertex shader. However, when
a geometry shader is active, we need to divide it up between the
vertex and geometry shaders.

The size of the URB entries for the vertex and geometry shaders can
vary dramatically from one shader to the next. So it doesn't make
sense to simply split the available space in two. In particular:

- On Ivy Bridge GT1, this would not leave enough space for the worst
  case geometry shader, which requires 64k of URB space.

- Due to hardware-imposed limits on the maximum number of URB entries,
  sometimes a given shader stage will only be capable of using a small
  amount of URB space. When this happens, it may make sense to
  allocate substantially less than half of the available space to that
  stage.

Our algorithm for dividing space between the two stages is to first
compute (a) the minimum amount of URB space that each stage needs in
order to function properly, and (b) the amount of additional URB space
that each stage "wants" (i.e. that it would be capable of making use
of). If the total amount of space available is not enough to satisfy
needs + wants, then each stage's "wants" amount is scaled back by the
same factor in order to fit.

When only a vertex shader is active, this algorithm produces
equivalent results to the old algorithm (if the vertex shader stage
can make use of all the available URB space, we assign all the space
to it; if it can't, we let it use as much as it can).

In the future, when we need to support tessellation control and
tessellation evaluation pipeline stages, it should be straightforward
to expand this algorithm to cover them.

v2: Use "unsigned" rather than "GLuint".

Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_context.h | 6 +- src/mesa/drivers/dri/i965/gen7_blorp.cpp | 16 ++- src/mesa/drivers/dri/i965/gen7_urb.c | 156 +++++++++++++++++++---- 3 files changed, 143 insertions(+), 35 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 3f17f1d1066..0bfe606420b 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1511,8 +1511,10 @@ void gen7_allocate_push_constants(struct brw_context *brw); void -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries, - GLuint vs_size, GLuint vs_start); +gen7_emit_urb_state(struct brw_context *brw, + unsigned nr_vs_entries, unsigned vs_size, + unsigned vs_start, unsigned nr_gs_entries, + unsigned gs_size, unsigned gs_start); diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp index a387836b9f2..6c798b12631 100644 --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp @@ -51,14 +51,16 @@ static void gen7_blorp_emit_urb_config(struct brw_context *brw, const brw_blorp_params *params) { - /* The minimum valid value is 32. See 3DSTATE_URB_VS, - * Dword 1.15:0 "VS Number of URB Entries". + /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword + * 1.15:0 "VS Number of URB Entries". 
*/ - int num_vs_entries = 32; - int vs_size = 2; - int vs_start = 2; /* skip over push constants */ - - gen7_emit_urb_state(brw, num_vs_entries, vs_size, vs_start); + gen7_emit_urb_state(brw, + 32 /* num_vs_entries */, + 2 /* vs_size */, + 2 /* vs_start */, + 0 /* num_gs_entries */, + 1 /* gs_size */, + 2 /* gs_start */); } diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c index 927af3782a2..ed5cda8001b 100644 --- a/src/mesa/drivers/dri/i965/gen7_urb.c +++ b/src/mesa/drivers/dri/i965/gen7_urb.c @@ -74,34 +74,137 @@ gen7_upload_urb(struct brw_context *brw) { const int push_size_kB = brw->is_haswell && brw->gt == 3 ? 32 : 16; - /* Total space for entries is URB size - 16kB for push constants */ - int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /* bytes */ - /* CACHE_NEW_VS_PROG */ unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1); - - int nr_vs_entries = handle_region_size / (vs_size * 64); - if (nr_vs_entries > brw->urb.max_vs_entries) - nr_vs_entries = brw->urb.max_vs_entries; - - /* According to volume 2a, nr_vs_entries must be a multiple of 8. */ - brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8); - - /* URB Starting Addresses are specified in multiples of 8kB. */ - brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */ - - assert(brw->urb.nr_vs_entries % 8 == 0); - assert(brw->urb.nr_gs_entries % 8 == 0); - /* GS requirement */ - assert(!brw->ff_gs.prog_active); + unsigned vs_entry_size_bytes = vs_size * 64; + /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_GS_PROG */ + bool gs_present = brw->geometry_program; + unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1; + unsigned gs_entry_size_bytes = gs_size * 64; + + /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): + * + * VS Number of URB Entries must be divisible by 8 if the VS URB Entry + * Allocation Size is less than 9 512-bit URB entries. + * + * Similar text exists for GS. 
+ */ + unsigned vs_granularity = (vs_size < 9) ? 8 : 1; + unsigned gs_granularity = (gs_size < 9) ? 8 : 1; + + /* URB allocations must be done in 8k chunks. */ + unsigned chunk_size_bytes = 8192; + + /* Determine the size of the URB in chunks. + */ + unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes; + + /* Reserve space for push constants */ + unsigned push_constant_bytes = 1024 * push_size_kB; + unsigned push_constant_chunks = + push_constant_bytes / chunk_size_bytes; + + /* Initially, assign each stage the minimum amount of URB space it needs, + * and make a note of how much additional space it "wants" (the amount of + * additional space it could actually make use of). + */ + + /* VS always requires at least 32 URB entries */ + unsigned vs_chunks = + ALIGN(32 * vs_entry_size_bytes, chunk_size_bytes) / chunk_size_bytes; + unsigned vs_wants = + ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes, + chunk_size_bytes) / chunk_size_bytes - vs_chunks; + + unsigned gs_chunks = 0; + unsigned gs_wants = 0; + if (gs_present) { + /* There are two constraints on the minimum amount of URB space we can + * allocate: + * + * (1) We need room for at least 2 URB entries, since we always operate + * the GS in DUAL_OBJECT mode. + * + * (2) We can't allocate less than nr_gs_entries_granularity. + */ + gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes, + chunk_size_bytes) / chunk_size_bytes; + gs_wants = + ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes, + chunk_size_bytes) / chunk_size_bytes - gs_chunks; + } + + /* There should always be enough URB space to satisfy the minimum + * requirements of each stage. + */ + unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks; + assert(total_needs <= urb_chunks); + + /* Mete out remaining space (if any) in proportion to "wants". 
*/ + unsigned total_wants = vs_wants + gs_wants; + unsigned remaining_space = urb_chunks - total_needs; + if (remaining_space > total_wants) + remaining_space = total_wants; + if (remaining_space > 0) { + unsigned vs_additional = (unsigned) + round(vs_wants * (((double) remaining_space) / total_wants)); + vs_chunks += vs_additional; + remaining_space -= vs_additional; + gs_chunks += remaining_space; + } + + /* Sanity check that we haven't over-allocated. */ + assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks); + + /* Finally, compute the number of entries that can fit in the space + * allocated to each stage. + */ + unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes; + unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes; + + /* Since we rounded up when computing *_wants, this may be slightly more + * than the maximum allowed amount, so correct for that. + */ + nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries); + nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries); + + /* Ensure that we program a multiple of the granularity. */ + nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); + nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); + + /* Finally, sanity check to make sure we have at least the minimum number + * of entries needed for each stage. + */ + assert(nr_vs_entries >= 32); + if (gs_present) + assert(nr_gs_entries >= 2); + + /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems + * better to put reasonable data in there rather than leave them + * uninitialized. 
+ */ + brw->urb.nr_vs_entries = nr_vs_entries; + brw->urb.nr_gs_entries = nr_gs_entries; + + /* Lay out the URB in the following order: + * - push constants + * - VS + * - GS + */ + brw->urb.vs_start = push_constant_chunks; + brw->urb.gs_start = push_constant_chunks + vs_chunks; gen7_emit_vs_workaround_flush(brw); - gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start); + gen7_emit_urb_state(brw, + brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start, + brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start); } void -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries, - GLuint vs_size, GLuint vs_start) +gen7_emit_urb_state(struct brw_context *brw, + unsigned nr_vs_entries, unsigned vs_size, + unsigned vs_start, unsigned nr_gs_entries, + unsigned gs_size, unsigned gs_start) { BEGIN_BATCH(8); OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2)); @@ -109,11 +212,12 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries, ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - /* Allocate the GS, HS, and DS zero space - we don't use them. */ OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2)); - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); + OUT_BATCH(nr_gs_entries | + ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | + (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); + /* Allocate the HS and DS zero space - we don't use them. */ OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); @@ -127,8 +231,8 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries, const struct brw_tracked_state gen7_urb = { .dirty = { .mesa = 0, - .brw = BRW_NEW_CONTEXT, - .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_FF_GS_PROG), + .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, + .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG), }, .emit = gen7_upload_urb, }; -- 2.30.2