X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fgen7_urb.c;h=abc922a86a356a400c77d2922d706467c517ee20;hb=bccf2a25a89622c49dcfa488763c245e6584d568;hp=b60bd23b49f7dcc6d6f7a3490cc81f8c51f2a4f8;hpb=ff366f3db4a117244c6076e5babd440c912200f9;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c index b60bd23b49f..abc922a86a3 100644 --- a/src/mesa/drivers/dri/i965/gen7_urb.c +++ b/src/mesa/drivers/dri/i965/gen7_urb.c @@ -27,6 +27,8 @@ #include "brw_state.h" #include "brw_defines.h" +#include "common/gen_l3_config.h" + /** * The following diagram shows how we partition the URB: * @@ -60,15 +62,17 @@ static void gen7_allocate_push_constants(struct brw_context *brw) { + const struct gen_device_info *devinfo = &brw->screen->devinfo; + /* BRW_NEW_GEOMETRY_PROGRAM */ - bool gs_present = brw->geometry_program; + bool gs_present = brw->programs[MESA_SHADER_GEOMETRY]; /* BRW_NEW_TESS_PROGRAMS */ - bool tess_present = brw->tess_eval_program; + bool tess_present = brw->programs[MESA_SHADER_TESS_EVAL]; unsigned avail_size = 16; unsigned multiplier = - (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1; + (devinfo->gen >= 8 || (devinfo->is_haswell && devinfo->gt == 3)) ? 2 : 1; int stages = 2 + gs_present + 2 * tess_present; @@ -99,7 +103,11 @@ gen7_allocate_push_constants(struct brw_context *brw) * Similar text exists for the other 3DSTATE_PUSH_CONSTANT_ALLOC_* * commands. */ - brw->ctx.NewDriverState |= BRW_NEW_PUSH_CONSTANT_ALLOCATION; + brw->vs.base.push_constants_dirty = true; + brw->tcs.base.push_constants_dirty = true; + brw->tes.base.push_constants_dirty = true; + brw->gs.base.push_constants_dirty = true; + brw->wm.base.push_constants_dirty = true; } void @@ -107,8 +115,36 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, unsigned hs_size, unsigned ds_size, unsigned gs_size, unsigned fs_size) { + const struct gen_device_info *devinfo = &brw->screen->devinfo; unsigned offset = 0; + /* From the SKL PRM, Workarounds section (#878): + * + * Push constant buffer corruption possible. WA: Insert 2 zero-length + * PushConst_PS before every intended PushConst_PS update, issue a + * NULLPRIM after each of the zero len PC update to make sure CS commits + * them. + * + * This workaround is attempting to solve a pixel shader push constant + * synchronization issue. + * + * There's an unpublished WA that involves re-emitting + * 3DSTATE_PUSH_CONSTANT_ALLOC_PS for every 500-ish 3DSTATE_CONSTANT_PS + * packets. Since our counting methods may not be reliable due to + * context-switching and pre-emption, we instead choose to approximate this + * behavior by re-emitting the packet at the top of the batch. + */ + if (brw->ctx.NewDriverState == BRW_NEW_BATCH) { + /* SKL GT2 and GLK 2x6 have reliably demonstrated this issue thus far. + * We've also seen some intermittent failures from SKL GT4 and BXT in + * the past. + */ + if (!devinfo->is_skylake && + !devinfo->is_broxton && + !devinfo->is_geminilake) + return; + } + BEGIN_BATCH(10); OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2)); OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); @@ -137,7 +173,7 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, * * No such restriction exists for Haswell or Baytrail. */ - if (brw->gen < 8 && !brw->is_haswell && !brw->is_baytrail) + if (devinfo->gen < 8 && !devinfo->is_haswell && !devinfo->is_baytrail) gen7_emit_cs_stall_flush(brw); } @@ -145,46 +181,13 @@ const struct brw_tracked_state gen7_push_constant_space = { .dirty = { .mesa = 0, .brw = BRW_NEW_CONTEXT | + BRW_NEW_BATCH | /* Push constant workaround */ BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TESS_PROGRAMS, }, .emit = gen7_allocate_push_constants, }; -static void -gen7_emit_urb_state(struct brw_context *brw, - unsigned nr_vs_entries, - unsigned vs_size, unsigned vs_start, - unsigned nr_hs_entries, - unsigned hs_size, unsigned hs_start, - unsigned nr_ds_entries, - unsigned ds_size, unsigned ds_start, - unsigned nr_gs_entries, - unsigned gs_size, unsigned gs_start) -{ - BEGIN_BATCH(8); - OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2)); - OUT_BATCH(nr_vs_entries | - ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - - OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2)); - OUT_BATCH(nr_gs_entries | - ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | - (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - - OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); - OUT_BATCH(nr_hs_entries | - ((hs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | - (hs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - - OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2)); - OUT_BATCH(nr_ds_entries | - ((ds_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | - (ds_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - ADVANCE_BATCH(); -} - static void upload_urb(struct brw_context *brw) { @@ -192,10 +195,10 @@ upload_urb(struct brw_context *brw) const struct brw_vue_prog_data *vs_vue_prog_data = brw_vue_prog_data(brw->vs.base.prog_data); const unsigned vs_size = MAX2(vs_vue_prog_data->urb_entry_size, 1); - /* BRW_NEW_GEOMETRY_PROGRAM, BRW_NEW_GS_PROG_DATA */ - const bool gs_present = brw->geometry_program; - /* BRW_NEW_TESS_PROGRAMS */ - const bool tess_present = brw->tess_eval_program; + /* BRW_NEW_GS_PROG_DATA */ + const bool gs_present = brw->gs.base.prog_data; + /* BRW_NEW_TES_PROG_DATA */ + const bool tess_present = brw->tes.base.prog_data; gen7_upload_urb(brw, vs_size, gs_present, tess_present); } @@ -205,246 +208,69 @@ gen7_upload_urb(struct brw_context *brw, unsigned vs_size, bool gs_present, bool tess_present) { const struct gen_device_info *devinfo = &brw->screen->devinfo; - const int push_size_kB = - (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16; - /* BRW_NEW_VS_PROG_DATA */ - unsigned vs_entry_size_bytes = vs_size * 64; - /* BRW_NEW_GEOMETRY_PROGRAM, BRW_NEW_GS_PROG_DATA */ - const struct brw_vue_prog_data *gs_vue_prog_data = - brw_vue_prog_data(brw->gs.base.prog_data); - unsigned gs_size = gs_present ? gs_vue_prog_data->urb_entry_size : 1; - unsigned gs_entry_size_bytes = gs_size * 64; - - /* BRW_NEW_TCS_PROG_DATA */ - const struct brw_vue_prog_data *tcs_vue_prog_data = - brw_vue_prog_data(brw->tcs.base.prog_data); - unsigned hs_size = tess_present ? tcs_vue_prog_data->urb_entry_size : 1; - unsigned hs_entry_size_bytes = hs_size * 64; - /* BRW_NEW_TES_PROG_DATA */ - const struct brw_vue_prog_data *tes_vue_prog_data = - brw_vue_prog_data(brw->tes.base.prog_data); - unsigned ds_size = tess_present ? tes_vue_prog_data->urb_entry_size : 1; - unsigned ds_entry_size_bytes = ds_size * 64; + /* BRW_NEW_{VS,TCS,TES,GS}_PROG_DATA */ + struct brw_vue_prog_data *prog_data[4] = { + [MESA_SHADER_VERTEX] = + brw_vue_prog_data(brw->vs.base.prog_data), + [MESA_SHADER_TESS_CTRL] = + tess_present ? brw_vue_prog_data(brw->tcs.base.prog_data) : NULL, + [MESA_SHADER_TESS_EVAL] = + tess_present ? brw_vue_prog_data(brw->tes.base.prog_data) : NULL, + [MESA_SHADER_GEOMETRY] = + gs_present ? brw_vue_prog_data(brw->gs.base.prog_data) : NULL, + }; + + unsigned entry_size[4]; + entry_size[MESA_SHADER_VERTEX] = vs_size; + for (int i = MESA_SHADER_TESS_CTRL; i <= MESA_SHADER_GEOMETRY; i++) { + entry_size[i] = prog_data[i] ? prog_data[i]->urb_entry_size : 1; + } /* If we're just switching between programs with the same URB requirements, * skip the rest of the logic. */ - if (!(brw->ctx.NewDriverState & BRW_NEW_CONTEXT) && - !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) && - brw->urb.vsize == vs_size && + if (brw->urb.vsize == entry_size[MESA_SHADER_VERTEX] && brw->urb.gs_present == gs_present && - brw->urb.gsize == gs_size && + brw->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] && brw->urb.tess_present == tess_present && - brw->urb.hsize == hs_size && - brw->urb.dsize == ds_size) { + brw->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] && + brw->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) { return; } - brw->urb.vsize = vs_size; + brw->urb.vsize = entry_size[MESA_SHADER_VERTEX]; brw->urb.gs_present = gs_present; - brw->urb.gsize = gs_size; + brw->urb.gsize = entry_size[MESA_SHADER_GEOMETRY]; brw->urb.tess_present = tess_present; - brw->urb.hsize = hs_size; - brw->urb.dsize = ds_size; - - /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): - * - * VS Number of URB Entries must be divisible by 8 if the VS URB Entry - * Allocation Size is less than 9 512-bit URB entries. - * - * Similar text exists for HS, DS and GS. - */ - unsigned vs_granularity = (vs_size < 9) ? 8 : 1; - unsigned hs_granularity = (hs_size < 9) ? 8 : 1; - unsigned ds_granularity = (ds_size < 9) ? 8 : 1; - unsigned gs_granularity = (gs_size < 9) ? 8 : 1; - - /* URB allocations must be done in 8k chunks. */ - unsigned chunk_size_bytes = 8192; - - /* Determine the size of the URB in chunks. - * BRW_NEW_URB_SIZE - */ - unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes; - - /* Reserve space for push constants */ - unsigned push_constant_bytes = 1024 * push_size_kB; - unsigned push_constant_chunks = - push_constant_bytes / chunk_size_bytes; - - /* Initially, assign each stage the minimum amount of URB space it needs, - * and make a note of how much additional space it "wants" (the amount of - * additional space it could actually make use of). - */ - - /* VS has a lower limit on the number of URB entries. - * - * From the Broadwell PRM, 3DSTATE_URB_VS instruction: - * "When tessellation is enabled, the VS Number of URB Entries must be - * greater than or equal to 192." - */ - unsigned vs_min_entries = - tess_present && brw->gen == 8 ? 192 : devinfo->urb.min_vs_entries; - /* Min VS Entries isn't a multiple of 8 on Cherryview/Broxton; round up */ - vs_min_entries = ALIGN(vs_min_entries, vs_granularity); - - unsigned vs_chunks = - DIV_ROUND_UP(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes); - unsigned vs_wants = - DIV_ROUND_UP(devinfo->urb.max_vs_entries * vs_entry_size_bytes, - chunk_size_bytes) - vs_chunks; - - unsigned gs_chunks = 0; - unsigned gs_wants = 0; - if (gs_present) { - /* There are two constraints on the minimum amount of URB space we can - * allocate: - * - * (1) We need room for at least 2 URB entries, since we always operate - * the GS in DUAL_OBJECT mode. - * - * (2) We can't allocate less than nr_gs_entries_granularity. - */ - gs_chunks = DIV_ROUND_UP(MAX2(gs_granularity, 2) * gs_entry_size_bytes, - chunk_size_bytes); - gs_wants = DIV_ROUND_UP(devinfo->urb.max_gs_entries * gs_entry_size_bytes, - chunk_size_bytes) - gs_chunks; - } - - unsigned hs_chunks = 0; - unsigned hs_wants = 0; - unsigned ds_chunks = 0; - unsigned ds_wants = 0; - - if (tess_present) { - hs_chunks = - DIV_ROUND_UP(hs_granularity * hs_entry_size_bytes, - chunk_size_bytes); - hs_wants = - DIV_ROUND_UP(devinfo->urb.max_tcs_entries * hs_entry_size_bytes, - chunk_size_bytes) - hs_chunks; - - ds_chunks = - DIV_ROUND_UP(devinfo->urb.min_ds_entries * ds_entry_size_bytes, - chunk_size_bytes); - ds_wants = - DIV_ROUND_UP(devinfo->urb.max_tes_entries * ds_entry_size_bytes, - chunk_size_bytes) - ds_chunks; - } - - /* There should always be enough URB space to satisfy the minimum - * requirements of each stage. - */ - unsigned total_needs = push_constant_chunks + - vs_chunks + hs_chunks + ds_chunks + gs_chunks; - assert(total_needs <= urb_chunks); - - /* Mete out remaining space (if any) in proportion to "wants". */ - unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants; - unsigned remaining_space = urb_chunks - total_needs; - if (remaining_space > total_wants) - remaining_space = total_wants; - if (remaining_space > 0) { - unsigned vs_additional = (unsigned) - roundf(vs_wants * (((float) remaining_space) / total_wants)); - vs_chunks += vs_additional; - remaining_space -= vs_additional; - total_wants -= vs_wants; - - if (total_wants > 0) { - unsigned hs_additional = (unsigned) - round(hs_wants * (((double) remaining_space) / total_wants)); - hs_chunks += hs_additional; - remaining_space -= hs_additional; - total_wants -= hs_wants; - } - - if (total_wants > 0) { - unsigned ds_additional = (unsigned) - round(ds_wants * (((double) remaining_space) / total_wants)); - ds_chunks += ds_additional; - remaining_space -= ds_additional; - total_wants -= ds_wants; - } - - gs_chunks += remaining_space; - } - - /* Sanity check that we haven't over-allocated. */ - assert(push_constant_chunks + - vs_chunks + hs_chunks + ds_chunks + gs_chunks <= urb_chunks); + brw->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL]; + brw->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL]; - /* Finally, compute the number of entries that can fit in the space - * allocated to each stage. - */ - unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes; - unsigned nr_hs_entries = hs_chunks * chunk_size_bytes / hs_entry_size_bytes; - unsigned nr_ds_entries = ds_chunks * chunk_size_bytes / ds_entry_size_bytes; - unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes; - - /* Since we rounded up when computing *_wants, this may be slightly more - * than the maximum allowed amount, so correct for that. - */ - nr_vs_entries = MIN2(nr_vs_entries, devinfo->urb.max_vs_entries); - nr_hs_entries = MIN2(nr_hs_entries, devinfo->urb.max_tcs_entries); - nr_ds_entries = MIN2(nr_ds_entries, devinfo->urb.max_tes_entries); - nr_gs_entries = MIN2(nr_gs_entries, devinfo->urb.max_gs_entries); + unsigned entries[4]; + unsigned start[4]; + gen_get_urb_config(devinfo, brw->l3.config, + tess_present, gs_present, entry_size, + entries, start, NULL); - /* Ensure that we program a multiple of the granularity. */ - nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); - nr_hs_entries = ROUND_DOWN_TO(nr_hs_entries, hs_granularity); - nr_ds_entries = ROUND_DOWN_TO(nr_ds_entries, ds_granularity); - nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); + if (devinfo->gen == 7 && !devinfo->is_haswell && !devinfo->is_baytrail) + gen7_emit_vs_workaround_flush(brw); - /* Finally, sanity check to make sure we have at least the minimum number - * of entries needed for each stage. - */ - assert(nr_vs_entries >= vs_min_entries); - if (gs_present) - assert(nr_gs_entries >= 2); - if (tess_present) { - assert(nr_hs_entries >= 1); - assert(nr_ds_entries >= devinfo->urb.min_ds_entries); + BEGIN_BATCH(8); + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + assert(devinfo->gen != 10 || entry_size[i] % 3); + OUT_BATCH((_3DSTATE_URB_VS + i) << 16 | (2 - 2)); + OUT_BATCH(entries[i] | + ((entry_size[i] - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | + (start[i] << GEN7_URB_STARTING_ADDRESS_SHIFT)); } - - /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems - * better to put reasonable data in there rather than leave them - * uninitialized. - */ - brw->urb.nr_vs_entries = nr_vs_entries; - brw->urb.nr_hs_entries = nr_hs_entries; - brw->urb.nr_ds_entries = nr_ds_entries; - brw->urb.nr_gs_entries = nr_gs_entries; - - /* Lay out the URB in the following order: - * - push constants - * - VS - * - HS - * - DS - * - GS - */ - brw->urb.vs_start = push_constant_chunks; - brw->urb.hs_start = push_constant_chunks + vs_chunks; - brw->urb.ds_start = push_constant_chunks + vs_chunks + hs_chunks; - brw->urb.gs_start = push_constant_chunks + vs_chunks + hs_chunks + - ds_chunks; - - if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail) - gen7_emit_vs_workaround_flush(brw); - gen7_emit_urb_state(brw, - brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start, - brw->urb.nr_hs_entries, hs_size, brw->urb.hs_start, - brw->urb.nr_ds_entries, ds_size, brw->urb.ds_start, - brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start); + ADVANCE_BATCH(); } const struct brw_tracked_state gen7_urb = { .dirty = { .mesa = 0, - .brw = BRW_NEW_CONTEXT | + .brw = BRW_NEW_BLORP | + BRW_NEW_CONTEXT | BRW_NEW_URB_SIZE | - BRW_NEW_GEOMETRY_PROGRAM | - BRW_NEW_TESS_PROGRAMS | BRW_NEW_GS_PROG_DATA | BRW_NEW_TCS_PROG_DATA | BRW_NEW_TES_PROG_DATA |