X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fgen7_urb.c;h=2e5f8e60ba94295e50b6b01bd13ccefc70e8eb21;hb=16096e9119d7f68c0382cf802ff7f3a93668ede9;hp=2653e9cbeef4f8d9f2b96e89872da7e400447feb;hpb=9b1a6745f6b827170ac29a00510dbb740c81a116;p=mesa.git

diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 2653e9cbeef..2e5f8e60ba9 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -27,6 +27,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
+#include "common/gen_l3_config.h"
+
 /**
  * The following diagram shows how we partition the URB:
  *
@@ -34,7 +36,7 @@
  * __________-__________ _________________-_________________
  * / \ / \
  * +-------------------------------------------------------------+
- * | VS/FS/GS Push | VS/GS URB |
+ * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB |
  * | Constants | Entries |
  * +-------------------------------------------------------------+
  *
@@ -60,27 +62,34 @@
 static void
 gen7_allocate_push_constants(struct brw_context *brw)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   bool gs_present = brw->programs[MESA_SHADER_GEOMETRY];
+
+   /* BRW_NEW_TESS_PROGRAMS */
+   bool tess_present = brw->programs[MESA_SHADER_TESS_EVAL];
+
    unsigned avail_size = 16;
    unsigned multiplier =
-      (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1;
+      (devinfo->gen >= 8 || (devinfo->is_haswell && devinfo->gt == 3)) ? 2 : 1;
 
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool gs_present = brw->geometry_program;
+   int stages = 2 + gs_present + 2 * tess_present;
 
-   unsigned vs_size, gs_size;
-   if (gs_present) {
-      vs_size = avail_size / 3;
-      avail_size -= vs_size;
-      gs_size = avail_size / 2;
-      avail_size -= gs_size;
-   } else {
-      vs_size = avail_size / 2;
-      avail_size -= vs_size;
-      gs_size = 0;
-   }
-   unsigned fs_size = avail_size;
+   /* Divide up the available space equally between stages. Because we
+    * round down (using floor division), there may be some left over
+    * space. We allocate that to the pixel shader stage.
+    */
+   unsigned size_per_stage = avail_size / stages;
+
+   unsigned vs_size = size_per_stage;
+   unsigned hs_size = tess_present ? size_per_stage : 0;
+   unsigned ds_size = tess_present ? size_per_stage : 0;
+   unsigned gs_size = gs_present ? size_per_stage : 0;
+   unsigned fs_size = avail_size - size_per_stage * (stages - 1);
 
    gen7_emit_push_constant_state(brw, multiplier * vs_size,
+                                 multiplier * hs_size, multiplier * ds_size,
                                  multiplier * gs_size, multiplier * fs_size);
 
    /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS):
@@ -94,20 +103,34 @@ gen7_allocate_push_constants(struct brw_context *brw)
     * Similar text exists for the other 3DSTATE_PUSH_CONSTANT_ALLOC_*
     * commands.
    */
-   brw->state.dirty.brw |= BRW_NEW_PUSH_CONSTANT_ALLOCATION;
+   brw->vs.base.push_constants_dirty = true;
+   brw->tcs.base.push_constants_dirty = true;
+   brw->tes.base.push_constants_dirty = true;
+   brw->gs.base.push_constants_dirty = true;
+   brw->wm.base.push_constants_dirty = true;
 }
 
 void
 gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned hs_size, unsigned ds_size,
                               unsigned gs_size, unsigned fs_size)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    unsigned offset = 0;
 
-   BEGIN_BATCH(6);
+   BEGIN_BATCH(10);
    OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
    OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
    offset += vs_size;
 
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2));
+   OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += hs_size;
+
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2));
+   OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += ds_size;
+
    OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
    OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
    offset += gs_size;
@@ -118,191 +141,113 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
 
    /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
     *
-    *     A PIPE_CONTOL command with the CS Stall bit set must be programmed
+    *     A PIPE_CONTROL command with the CS Stall bit set must be programmed
     *     in the ring after this instruction.
     *
-    * No such restriction exists for Haswell.
+    * No such restriction exists for Haswell or Baytrail.
     */
-   if (brw->gen < 8 && !brw->is_haswell)
+   if (devinfo->gen < 8 && !devinfo->is_haswell && !devinfo->is_baytrail)
       gen7_emit_cs_stall_flush(brw);
 }
 
 const struct brw_tracked_state gen7_push_constant_space = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
-      .cache = 0,
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = gen7_allocate_push_constants,
 };
 
 static void
-gen7_upload_urb(struct brw_context *brw)
+upload_urb(struct brw_context *brw)
 {
-   const int push_size_kB =
-      (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16;
-
-   /* CACHE_NEW_VS_PROG */
-   unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);
-   unsigned vs_entry_size_bytes = vs_size * 64;
-   /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_GS_PROG */
-   bool gs_present = brw->geometry_program;
-   unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
-   unsigned gs_entry_size_bytes = gs_size * 64;
-
-   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
-    *
-    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
-    *     Allocation Size is less than 9 512-bit URB entries.
-    *
-    * Similar text exists for GS.
-    */
-   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
-   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
-
-   /* URB allocations must be done in 8k chunks. */
-   unsigned chunk_size_bytes = 8192;
-
-   /* Determine the size of the URB in chunks.
-    */
-   unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes;
-
-   /* Reserve space for push constants */
-   unsigned push_constant_bytes = 1024 * push_size_kB;
-   unsigned push_constant_chunks =
-      push_constant_bytes / chunk_size_bytes;
-
-   /* Initially, assign each stage the minimum amount of URB space it needs,
-    * and make a note of how much additional space it "wants" (the amount of
-    * additional space it could actually make use of).
-    */
-
-   /* VS has a lower limit on the number of URB entries */
-   unsigned vs_chunks =
-      ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) /
-      chunk_size_bytes;
-   unsigned vs_wants =
-      ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
-            chunk_size_bytes) / chunk_size_bytes - vs_chunks;
+   /* BRW_NEW_VS_PROG_DATA */
+   const struct brw_vue_prog_data *vs_vue_prog_data =
+      brw_vue_prog_data(brw->vs.base.prog_data);
+   const unsigned vs_size = MAX2(vs_vue_prog_data->urb_entry_size, 1);
+   /* BRW_NEW_GS_PROG_DATA */
+   const bool gs_present = brw->gs.base.prog_data;
+   /* BRW_NEW_TES_PROG_DATA */
+   const bool tess_present = brw->tes.base.prog_data;
+
+   gen7_upload_urb(brw, vs_size, gs_present, tess_present);
+}
 
-   unsigned gs_chunks = 0;
-   unsigned gs_wants = 0;
-   if (gs_present) {
-      /* There are two constraints on the minimum amount of URB space we can
-       * allocate:
-       *
-       * (1) We need room for at least 2 URB entries, since we always operate
-       * the GS in DUAL_OBJECT mode.
-       *
-       * (2) We can't allocate less than nr_gs_entries_granularity.
-       */
-      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
-                        chunk_size_bytes) / chunk_size_bytes;
-      gs_wants =
-         ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
-               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
+void
+gen7_upload_urb(struct brw_context *brw, unsigned vs_size,
+                bool gs_present, bool tess_present)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   const int push_size_kB =
+      (devinfo->gen >= 8 || (devinfo->is_haswell && devinfo->gt == 3)) ? 32 : 16;
+
+   /* BRW_NEW_{VS,TCS,TES,GS}_PROG_DATA */
+   struct brw_vue_prog_data *prog_data[4] = {
+      [MESA_SHADER_VERTEX] =
+         brw_vue_prog_data(brw->vs.base.prog_data),
+      [MESA_SHADER_TESS_CTRL] =
+         tess_present ? brw_vue_prog_data(brw->tcs.base.prog_data) : NULL,
+      [MESA_SHADER_TESS_EVAL] =
+         tess_present ? brw_vue_prog_data(brw->tes.base.prog_data) : NULL,
+      [MESA_SHADER_GEOMETRY] =
+         gs_present ? brw_vue_prog_data(brw->gs.base.prog_data) : NULL,
+   };
+
+   unsigned entry_size[4];
+   entry_size[MESA_SHADER_VERTEX] = vs_size;
+   for (int i = MESA_SHADER_TESS_CTRL; i <= MESA_SHADER_GEOMETRY; i++) {
+      entry_size[i] = prog_data[i] ? prog_data[i]->urb_entry_size : 1;
    }
 
-   /* There should always be enough URB space to satisfy the minimum
-    * requirements of each stage.
+   /* If we're just switching between programs with the same URB requirements,
+    * skip the rest of the logic.
     */
-   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
-   assert(total_needs <= urb_chunks);
-
-   /* Mete out remaining space (if any) in proportion to "wants". */
-   unsigned total_wants = vs_wants + gs_wants;
-   unsigned remaining_space = urb_chunks - total_needs;
-   if (remaining_space > total_wants)
-      remaining_space = total_wants;
-   if (remaining_space > 0) {
-      unsigned vs_additional = (unsigned)
-         round(vs_wants * (((double) remaining_space) / total_wants));
-      vs_chunks += vs_additional;
-      remaining_space -= vs_additional;
-      gs_chunks += remaining_space;
+   if (brw->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
+       brw->urb.gs_present == gs_present &&
+       brw->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
+       brw->urb.tess_present == tess_present &&
+       brw->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
+       brw->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
+      return;
    }
-
-   /* Sanity check that we haven't over-allocated.
*/ - assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks); - - /* Finally, compute the number of entries that can fit in the space - * allocated to each stage. - */ - unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes; - unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes; - - /* Since we rounded up when computing *_wants, this may be slightly more - * than the maximum allowed amount, so correct for that. - */ - nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries); - nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries); - - /* Ensure that we program a multiple of the granularity. */ - nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); - nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); - - /* Finally, sanity check to make sure we have at least the minimum number - * of entries needed for each stage. - */ - assert(nr_vs_entries >= brw->urb.min_vs_entries); - if (gs_present) - assert(nr_gs_entries >= 2); - - /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems - * better to put reasonable data in there rather than leave them - * uninitialized. - */ - brw->urb.nr_vs_entries = nr_vs_entries; - brw->urb.nr_gs_entries = nr_gs_entries; - - /* Lay out the URB in the following order: - * - push constants - * - VS - * - GS - */ - brw->urb.vs_start = push_constant_chunks; - brw->urb.gs_start = push_constant_chunks + vs_chunks; - - if (brw->gen == 7 && !brw->is_haswell) + brw->urb.vsize = entry_size[MESA_SHADER_VERTEX]; + brw->urb.gs_present = gs_present; + brw->urb.gsize = entry_size[MESA_SHADER_GEOMETRY]; + brw->urb.tess_present = tess_present; + brw->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL]; + brw->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL]; + + unsigned entries[4]; + unsigned start[4]; + gen_get_urb_config(devinfo, 1024 * push_size_kB, 1024 * brw->urb.size, + tess_present, gs_present, entry_size, entries, start); + + if (devinfo->gen == 7 && !devinfo->is_haswell && !devinfo->is_baytrail) gen7_emit_vs_workaround_flush(brw); - gen7_emit_urb_state(brw, - brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start, - brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start); -} -void -gen7_emit_urb_state(struct brw_context *brw, - unsigned nr_vs_entries, unsigned vs_size, - unsigned vs_start, unsigned nr_gs_entries, - unsigned gs_size, unsigned gs_start) -{ BEGIN_BATCH(8); - OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2)); - OUT_BATCH(nr_vs_entries | - ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - - OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2)); - OUT_BATCH(nr_gs_entries | - ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | - (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - - /* Allocate the HS and DS zero space - we don't use them. 
*/ - OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - - OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2)); - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + assert(devinfo->gen != 10 || entry_size[i] % 3); + OUT_BATCH((_3DSTATE_URB_VS + i) << 16 | (2 - 2)); + OUT_BATCH(entries[i] | + ((entry_size[i] - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | + (start[i] << GEN7_URB_STARTING_ADDRESS_SHIFT)); + } ADVANCE_BATCH(); } const struct brw_tracked_state gen7_urb = { .dirty = { .mesa = 0, - .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, - .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG), + .brw = BRW_NEW_BLORP | + BRW_NEW_CONTEXT | + BRW_NEW_URB_SIZE | + BRW_NEW_GS_PROG_DATA | + BRW_NEW_TCS_PROG_DATA | + BRW_NEW_TES_PROG_DATA | + BRW_NEW_VS_PROG_DATA, }, - .emit = gen7_upload_urb, + .emit = upload_urb, };