From 73de09f265cb1c66d70fd9eb92021882bfbbbef6 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 7 Jun 2010 09:25:10 -0700 Subject: [PATCH] i965: Convert the binding table to streamed indirect state. This slightly reduces cairo-gl firefox-talos-gfx runtime on my Ironlake: before: [ # ] backend test min(s) median(s) stddev. count [ 0] gl firefox-talos-gfx 38.236 38.383 0.43% 5/6 after: [ 0] gl firefox-talos-gfx 37.799 38.203 0.39% 6/6 It turns out the cost of caching these objects and looking them up in the cache again is greater than the cost of just computing the object again, particularly when the overhead of having a separate BO to pin is removed. (Those that are paying close attention will note that this is a reversal of the path I was moving the driver in a couple of years ago. The major thing that has changed is that back then all state was recomputed when we wrapped the streaming state buffer, including recompiling our precious programs. Now, we're uncaching just the objects that are cheap to compute, and retaining caching of expensive objects) --- configure.ac | 2 +- src/mesa/drivers/dri/i965/brw_context.h | 7 +- src/mesa/drivers/dri/i965/brw_misc_state.c | 28 ++-- src/mesa/drivers/dri/i965/brw_state.h | 1 + src/mesa/drivers/dri/i965/brw_state_cache.c | 1 - src/mesa/drivers/dri/i965/brw_state_upload.c | 4 +- .../drivers/dri/i965/brw_vs_surface_state.c | 112 +++++++--------- .../drivers/dri/i965/brw_wm_surface_state.c | 123 +++++++++--------- 8 files changed, 132 insertions(+), 146 deletions(-) diff --git a/configure.ac b/configure.ac index 1056fa18100..7307d89ae21 100644 --- a/configure.ac +++ b/configure.ac @@ -860,7 +860,7 @@ AC_SUBST([DRI_LIB_DEPS]) case $DRI_DIRS in *i915*|*i965*) - PKG_CHECK_MODULES([INTEL], [libdrm_intel >= 2.4.19]) + PKG_CHECK_MODULES([INTEL], [libdrm_intel >= 2.4.21]) ;; esac diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index a8290673838..86b86fde9a7 100644 --- 
a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -131,6 +131,7 @@ struct brw_context; #define BRW_NEW_WM_INPUT_DIMENSIONS 0x100 #define BRW_NEW_PSP 0x800 #define BRW_NEW_WM_SURFACES 0x1000 +#define BRW_NEW_BINDING_TABLE 0x2000 #define BRW_NEW_INDICES 0x4000 #define BRW_NEW_VERTICES 0x8000 /** @@ -302,7 +303,6 @@ enum brw_cache_id { BRW_CLIP_UNIT, BRW_CLIP_PROG, BRW_SS_SURFACE, - BRW_SS_SURF_BIND, BRW_MAX_CACHE }; @@ -377,7 +377,6 @@ struct brw_tracked_state { #define CACHE_NEW_CLIP_UNIT (1<vs.bind_bo); - brw_add_validated_bo(brw, brw->wm.bind_bo); -} - /** * Upload the binding table pointers, which point each stage's array of surface * state pointers. @@ -116,23 +110,24 @@ static void upload_binding_table_pointers(struct brw_context *brw) BEGIN_BATCH(6); OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2)); if (brw->vs.bind_bo != NULL) - OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */ + OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, + brw->vs.bind_bo_offset); /* vs */ else OUT_BATCH(0); OUT_BATCH(0); /* gs */ OUT_BATCH(0); /* clip */ OUT_BATCH(0); /* sf */ - OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */ + OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, + brw->wm.bind_bo_offset); /* wm/ps */ ADVANCE_BATCH(); } const struct brw_tracked_state brw_binding_table_pointers = { .dirty = { .mesa = 0, - .brw = BRW_NEW_BATCH, - .cache = CACHE_NEW_SURF_BIND, + .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE, + .cache = 0, }, - .prepare = prepare_binding_table_pointers, .emit = upload_binding_table_pointers, }; @@ -154,21 +149,22 @@ static void upload_gen6_binding_table_pointers(struct brw_context *brw) GEN6_BINDING_TABLE_MODIFY_PS | (4 - 2)); if (brw->vs.bind_bo != NULL) - OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */ + OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, + brw->vs.bind_bo_offset); /* vs */ else OUT_BATCH(0); OUT_BATCH(0); /* gs */ - 
OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */ + OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, + brw->wm.bind_bo_offset); /* wm/ps */ ADVANCE_BATCH(); } const struct brw_tracked_state gen6_binding_table_pointers = { .dirty = { .mesa = 0, - .brw = BRW_NEW_BATCH, - .cache = CACHE_NEW_SURF_BIND, + .brw = BRW_NEW_BATCH | BRW_NEW_BINDING_TABLE, + .cache = 0, }, - .prepare = prepare_binding_table_pointers, .emit = upload_gen6_binding_table_pointers, }; diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 364be941171..68fd7d4f807 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -81,6 +81,7 @@ const struct brw_tracked_state brw_wm_prog; const struct brw_tracked_state brw_wm_samplers; const struct brw_tracked_state brw_wm_constant_surface; const struct brw_tracked_state brw_wm_surfaces; +const struct brw_tracked_state brw_wm_binding_table; const struct brw_tracked_state brw_wm_unit; const struct brw_tracked_state brw_psp_urb_cbs; diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c index f1d89484fce..45f1088f4aa 100644 --- a/src/mesa/drivers/dri/i965/brw_state_cache.c +++ b/src/mesa/drivers/dri/i965/brw_state_cache.c @@ -410,7 +410,6 @@ brw_init_surface_cache(struct brw_context *brw) calloc(1, cache->size * sizeof(struct brw_cache_item)); brw_init_cache_id(cache, "SS_SURFACE", BRW_SS_SURFACE); - brw_init_cache_id(cache, "SS_SURF_BIND", BRW_SS_SURF_BIND); } diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 08535bb59cc..e345dbcf5b7 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -69,6 +69,7 @@ static const struct brw_tracked_state *gen4_atoms[] = &brw_vs_surfaces, /* must do before unit */ &brw_wm_constant_surface, /* must do before wm surfaces/bind bo */ &brw_wm_surfaces, /* must 
do before samplers and unit */ + &brw_wm_binding_table, &brw_wm_samplers, &brw_wm_unit, @@ -268,6 +269,8 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_CONTEXT), DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS), DEFINE_BIT(BRW_NEW_PSP), + DEFINE_BIT(BRW_NEW_WM_SURFACES), + DEFINE_BIT(BRW_NEW_BINDING_TABLE), DEFINE_BIT(BRW_NEW_INDICES), DEFINE_BIT(BRW_NEW_INDEX_BUFFER), DEFINE_BIT(BRW_NEW_VERTICES), @@ -295,7 +298,6 @@ static struct dirty_bit_map cache_bits[] = { DEFINE_BIT(CACHE_NEW_CLIP_UNIT), DEFINE_BIT(CACHE_NEW_CLIP_PROG), DEFINE_BIT(CACHE_NEW_SURFACE), - DEFINE_BIT(CACHE_NEW_SURF_BIND), {0, 0, 0} }; diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index 26164e907f4..d946756af70 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -151,49 +151,29 @@ brw_update_vs_constant_surface( GLcontext *ctx, } -/** - * Constructs the binding table for the VS surface state. - */ -static drm_intel_bo * -brw_vs_get_binding_table(struct brw_context *brw) +static void +prepare_vs_surfaces(struct brw_context *brw) { - drm_intel_bo *bind_bo; - - bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND, - NULL, 0, - brw->vs.surf_bo, BRW_VS_MAX_SURF, - NULL); - - if (bind_bo == NULL) { - GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint); - uint32_t data[BRW_VS_MAX_SURF]; - int i; - - for (i = 0; i < BRW_VS_MAX_SURF; i++) - if (brw->vs.surf_bo[i]) - data[i] = brw->vs.surf_bo[i]->offset; - else - data[i] = 0; - - bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND, - NULL, 0, - brw->vs.surf_bo, BRW_VS_MAX_SURF, - data, data_size); - - /* Emit binding table relocations to surface state */ - for (i = 0; i < BRW_VS_MAX_SURF; i++) { - if (brw->vs.surf_bo[i] != NULL) { - /* The presumed offsets were set in the data values for - * brw_upload_cache. 
- */ - drm_intel_bo_emit_reloc(bind_bo, i * 4, - brw->vs.surf_bo[i], 0, - I915_GEM_DOMAIN_INSTRUCTION, 0); - } + GLcontext *ctx = &brw->intel.ctx; + int i; + int nr_surfaces = 0; + + brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER); + + for (i = 0; i < BRW_VS_MAX_SURF; i++) { + if (brw->vs.surf_bo[i] != NULL) { + nr_surfaces = i + 1; } } - return bind_bo; + if (brw->vs.nr_surfaces != nr_surfaces) { + brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES; + brw->vs.nr_surfaces = nr_surfaces; + } + + for (i = 0; i < BRW_VS_MAX_SURF; i++) { + brw_add_validated_bo(brw, brw->vs.surf_bo[i]); + } } /** @@ -203,43 +183,51 @@ brw_vs_get_binding_table(struct brw_context *brw) * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and * CACHE_NEW_SURF_BIND for the binding table upload. */ -static void prepare_vs_surfaces(struct brw_context *brw ) +static void upload_vs_surfaces(struct brw_context *brw) { - GLcontext *ctx = &brw->intel.ctx; + uint32_t *bind; int i; - int nr_surfaces = 0; - - brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER); - for (i = 0; i < BRW_VS_MAX_SURF; i++) { - if (brw->vs.surf_bo[i] != NULL) { - nr_surfaces = i + 1; + /* BRW_NEW_NR_VS_SURFACES */ + if (brw->vs.nr_surfaces == 0) { + if (brw->vs.bind_bo) { + drm_intel_bo_unreference(brw->vs.bind_bo); + brw->vs.bind_bo = NULL; + brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE; } + return; } - if (brw->vs.nr_surfaces != nr_surfaces) { - brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES; - brw->vs.nr_surfaces = nr_surfaces; - } - - /* Note that we don't end up updating the bind_bo if we don't have a - * surface to be pointing at. This should be relatively harmless, as it - * just slightly increases our working set size. + /* Might want to calculate nr_surfaces first, to avoid taking up so much + * space for the binding table. 
(once we have vs samplers) */ - if (brw->vs.nr_surfaces != 0) { - drm_intel_bo_unreference(brw->vs.bind_bo); - brw->vs.bind_bo = brw_vs_get_binding_table(brw); + bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_VS_MAX_SURF, + 32, &brw->vs.bind_bo, &brw->vs.bind_bo_offset); + + for (i = 0; i < BRW_VS_MAX_SURF; i++) { + /* BRW_NEW_VS_CONSTBUF */ + if (brw->vs.surf_bo[i]) { + drm_intel_bo_emit_reloc(brw->vs.bind_bo, + brw->vs.bind_bo_offset + i * sizeof(uint32_t), + brw->vs.surf_bo[i], 0, + I915_GEM_DOMAIN_INSTRUCTION, 0); + bind[i] = brw->vs.surf_bo[i]->offset; + } else { + bind[i] = 0; + } } + + brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE; } const struct brw_tracked_state brw_vs_surfaces = { .dirty = { .mesa = 0, - .brw = (BRW_NEW_VS_CONSTBUF), + .brw = (BRW_NEW_VS_CONSTBUF | + BRW_NEW_NR_VS_SURFACES | + BRW_NEW_BATCH), .cache = 0 }, .prepare = prepare_vs_surfaces, + .emit = upload_vs_surfaces, }; - - - diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 2b216fddbb5..ba6a6258f57 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -639,57 +639,10 @@ brw_update_renderbuffer_surface(struct brw_context *brw, } } - -/** - * Constructs the binding table for the WM surface state, which maps unit - * numbers to surface state objects. 
- */ -static drm_intel_bo * -brw_wm_get_binding_table(struct brw_context *brw) -{ - drm_intel_bo *bind_bo; - - assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF); - - bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND, - NULL, 0, - brw->wm.surf_bo, brw->wm.nr_surfaces, - NULL); - - if (bind_bo == NULL) { - GLuint data_size = brw->wm.nr_surfaces * sizeof(GLuint); - uint32_t data[BRW_WM_MAX_SURF]; - int i; - - for (i = 0; i < brw->wm.nr_surfaces; i++) - if (brw->wm.surf_bo[i]) - data[i] = brw->wm.surf_bo[i]->offset; - else - data[i] = 0; - - bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND, - NULL, 0, - brw->wm.surf_bo, brw->wm.nr_surfaces, - data, data_size); - - /* Emit binding table relocations to surface state */ - for (i = 0; i < BRW_WM_MAX_SURF; i++) { - if (brw->wm.surf_bo[i] != NULL) { - drm_intel_bo_emit_reloc(bind_bo, i * sizeof(GLuint), - brw->wm.surf_bo[i], 0, - I915_GEM_DOMAIN_INSTRUCTION, 0); - } - } - } - - return bind_bo; -} - static void prepare_wm_surfaces(struct brw_context *brw ) { GLcontext *ctx = &brw->intel.ctx; GLuint i; - int old_nr_surfaces; /* _NEW_BUFFERS | _NEW_COLOR */ /* Update surfaces for drawing buffers */ @@ -703,32 +656,21 @@ static void prepare_wm_surfaces(struct brw_context *brw ) brw_update_renderbuffer_surface(brw, NULL, 0); } - old_nr_surfaces = brw->wm.nr_surfaces; - brw->wm.nr_surfaces = BRW_MAX_DRAW_BUFFERS; - - if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL) - brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1; - /* Update surfaces for textures */ for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i]; const GLuint surf = SURF_INDEX_TEXTURE(i); - /* _NEW_TEXTURE, BRW_NEW_TEXDATA */ + /* _NEW_TEXTURE */ if (texUnit->_ReallyEnabled) { brw_update_texture_surface(ctx, i); - brw->wm.nr_surfaces = surf + 1; } else { drm_intel_bo_unreference(brw->wm.surf_bo[surf]); brw->wm.surf_bo[surf] = NULL; } } - 
drm_intel_bo_unreference(brw->wm.bind_bo); - brw->wm.bind_bo = brw_wm_get_binding_table(brw); - - if (brw->wm.nr_surfaces != old_nr_surfaces) - brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES; + brw->state.dirty.brw |= BRW_NEW_WM_SURFACES; } const struct brw_tracked_state brw_wm_surfaces = { @@ -736,12 +678,69 @@ const struct brw_tracked_state brw_wm_surfaces = { .mesa = (_NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS), - .brw = (BRW_NEW_CONTEXT | - BRW_NEW_WM_SURFACES), + .brw = (BRW_NEW_CONTEXT), .cache = 0 }, .prepare = prepare_wm_surfaces, }; +static void +brw_wm_prepare_binding_table(struct brw_context *brw) +{ + int i; + for (i = 0; i < BRW_WM_MAX_SURF; i++) { + if (brw->wm.surf_bo[i]) { + brw_add_validated_bo(brw, brw->wm.surf_bo[i]); + } + } +} +/** + * Constructs the binding table for the WM surface state, which maps unit + * numbers to surface state objects. + */ +static void +brw_wm_upload_binding_table(struct brw_context *brw) +{ + uint32_t *bind; + int i, nr_surfaces = 0; + + /* Might want to calculate nr_surfaces first, to avoid taking up so much + * space for the binding table. 
+ */ + bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_WM_MAX_SURF, + 32, &brw->wm.bind_bo, &brw->wm.bind_bo_offset); + + for (i = 0; i < BRW_WM_MAX_SURF; i++) { + /* BRW_NEW_WM_SURFACES */ + if (brw->wm.surf_bo[i]) { + drm_intel_bo_emit_reloc(brw->wm.bind_bo, + brw->wm.bind_bo_offset + i * sizeof(uint32_t), + brw->wm.surf_bo[i], 0, + I915_GEM_DOMAIN_INSTRUCTION, 0); + bind[i] = brw->wm.surf_bo[i]->offset; + nr_surfaces = i + 1; + } else { + bind[i] = 0; + } + } + + if (brw->wm.nr_surfaces != nr_surfaces) { + brw->wm.nr_surfaces = nr_surfaces; + brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES; + } + + brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE; +} + +const struct brw_tracked_state brw_wm_binding_table = { + .dirty = { + .mesa = 0, + .brw = (BRW_NEW_BATCH | + BRW_NEW_WM_SURFACES), + .cache = 0 + }, + .prepare = brw_wm_prepare_binding_table, + .emit = brw_wm_upload_binding_table, +}; -- 2.30.2