From 1a447749ed421db8eb6ba20012630785aef9bb12 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sat, 23 Apr 2011 00:33:43 -0700 Subject: [PATCH] i965/gen4: Move WM state to state streaming. The samplers are about to become streamed for gen6 performance, which would cause this unit to blow out the state cache. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_context.h | 2 +- src/mesa/drivers/dri/i965/brw_misc_state.c | 4 +- src/mesa/drivers/dri/i965/brw_state_dump.c | 3 +- src/mesa/drivers/dri/i965/brw_vtbl.c | 1 - src/mesa/drivers/dri/i965/brw_wm_state.c | 307 ++++++++------------- 5 files changed, 125 insertions(+), 192 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index fcc3a7941b6..b61a6ff9bc3 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -699,9 +699,9 @@ struct brw_context /** Binding table of pointers to surf_bo entries */ uint32_t bind_bo_offset; uint32_t surf_offset[BRW_WM_MAX_SURF]; + uint32_t state_offset; /* offset in batchbuffer to pre-gen6 WM state */ drm_intel_bo *prog_bo; - drm_intel_bo *state_bo; drm_intel_bo *const_bo; /* pull constant buffer. */ /** * This is offset in the batch to the push constants on gen6. 
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c index 19eea07ebc6..0ddd61bfda1 100644 --- a/src/mesa/drivers/dri/i965/brw_misc_state.c +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c @@ -151,7 +151,8 @@ static void upload_pipelined_state_pointers(struct brw_context *brw ) OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, brw->sf.state_offset); - OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, + brw->wm.state_offset); OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, brw->cc.state_offset); ADVANCE_BATCH(); @@ -166,7 +167,6 @@ static void prepare_psp_urb_cbs(struct brw_context *brw) brw_add_validated_bo(brw, brw->gs.state_bo); brw_add_validated_bo(brw, brw->clip.state_bo); brw_add_validated_bo(brw, brw->sf.state_bo); - brw_add_validated_bo(brw, brw->wm.state_bo); } static void upload_psp_urb_cbs(struct brw_context *brw ) diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c index e47adf600a8..06cf97519ad 100644 --- a/src/mesa/drivers/dri/i965/brw_state_dump.c +++ b/src/mesa/drivers/dri/i965/brw_state_dump.c @@ -405,7 +405,8 @@ void brw_debug_batch(struct intel_context *intel) dump_sf_viewport_state(brw); if (intel->gen < 6) - state_struct_out("WM", brw->wm.state_bo, 0, sizeof(struct brw_wm_unit_state)); + state_struct_out("WM", intel->batch.bo, brw->wm.state_offset, + sizeof(struct brw_wm_unit_state)); brw_debug_prog("WM prog", brw->wm.prog_bo); if (intel->gen >= 6) { diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c index 5aec6feb990..7add92aab43 100644 --- a/src/mesa/drivers/dri/i965/brw_vtbl.c +++ b/src/mesa/drivers/dri/i965/brw_vtbl.c @@ -89,7 +89,6 @@ static void brw_destroy_context( struct intel_context *intel ) dri_bo_release(&brw->wm.sdc_bo[i]); 
dri_bo_release(&brw->wm.sampler_bo); dri_bo_release(&brw->wm.prog_bo); - dri_bo_release(&brw->wm.state_bo); dri_bo_release(&brw->wm.const_bo); dri_bo_release(&brw->cc.prog_bo); diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 9d0a7a8d27d..d7faf490cfa 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -40,22 +40,6 @@ * WM unit - fragment programs and rasterization */ -struct brw_wm_unit_key { - unsigned int total_grf, total_grf_16, total_scratch; - unsigned int urb_entry_read_length; - unsigned int curb_entry_read_length; - unsigned int dispatch_grf_start_reg; - uint32_t prog_offset_16; - - unsigned int curbe_offset; - - unsigned int nr_surfaces, sampler_count; - GLboolean uses_depth, computes_depth, uses_kill, is_glsl; - GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable; - GLboolean color_write_enable; - GLfloat offset_units, offset_factor; -}; - bool brw_color_buffer_write_enabled(struct brw_context *brw) { @@ -82,25 +66,21 @@ brw_color_buffer_write_enabled(struct brw_context *brw) return false; } +/** + * Setup wm hardware state. 
See page 225 of Volume 2 + */ static void -wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) +brw_prepare_wm_unit(struct brw_context *brw) { - struct gl_context *ctx = &brw->intel.ctx; - const struct gl_fragment_program *fp = brw->fragment_program; struct intel_context *intel = &brw->intel; + struct gl_context *ctx = &intel->ctx; + const struct gl_fragment_program *fp = brw->fragment_program; + struct brw_wm_unit_state *wm; - memset(key, 0, sizeof(*key)); + wm = brw_state_batch(brw, sizeof(*wm), 32, &brw->wm.state_offset); + memset(wm, 0, sizeof(*wm)); - /* CACHE_NEW_WM_PROG */ - key->total_grf = brw->wm.prog_data->total_grf; - key->total_grf_16 = brw->wm.prog_data->total_grf_16; - key->urb_entry_read_length = brw->wm.prog_data->urb_read_length; - key->curb_entry_read_length = brw->wm.prog_data->curb_read_length; - key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; - key->total_scratch = brw->wm.prog_data->total_scratch; - key->prog_offset_16 = brw->wm.prog_data->prog_offset_16; - - if (key->prog_offset_16) { + if (brw->wm.prog_data->prog_offset_16) { /* These two fields should be the same pre-gen6, which is why we * only have one hardware field to program for both dispatch * widths. 
@@ -109,215 +89,167 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) brw->wm.prog_data->first_curbe_grf_16); } - /* BRW_NEW_CURBE_OFFSETS */ - key->curbe_offset = brw->curbe.wm_start; + /* CACHE_NEW_WM_PROG */ + wm->thread0.grf_reg_count = ALIGN(brw->wm.prog_data->total_grf, 16) / 16 - 1; + wm->wm9.grf_reg_count_2 = ALIGN(brw->wm.prog_data->total_grf_16, 16) / 16 - 1; + wm->thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */ + /* reloc */ + wm->wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset + + brw->wm.prog_data->prog_offset_16) >> 6; + wm->thread1.depth_coef_urb_read_offset = 1; + wm->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; - /* BRW_NEW_NR_SURFACEs */ - key->nr_surfaces = brw->wm.nr_surfaces; + if (intel->gen == 5) + wm->thread1.binding_table_entry_count = 0; /* hardware requirement */ + else { + /* BRW_NEW_NR_WM_SURFACES */ + wm->thread1.binding_table_entry_count = brw->wm.nr_surfaces; + } - /* CACHE_NEW_SAMPLER */ - key->sampler_count = brw->wm.sampler_count; + if (brw->wm.prog_data->total_scratch != 0) { + wm->thread2.scratch_space_base_pointer = + brw->wm.scratch_bo->offset >> 10; /* reloc */ + wm->thread2.per_thread_scratch_space = + ffs(brw->wm.prog_data->total_scratch) - 11; + } else { + wm->thread2.scratch_space_base_pointer = 0; + wm->thread2.per_thread_scratch_space = 0; + } - /* _NEW_POLYGONSTIPPLE */ - key->polygon_stipple = ctx->Polygon.StippleFlag; + wm->thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; + wm->thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length; + wm->thread3.urb_entry_read_offset = 0; + wm->thread3.const_urb_entry_read_length = + brw->wm.prog_data->curb_read_length; + /* BRW_NEW_CURBE_OFFSETS */ + wm->thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2; - /* BRW_NEW_FRAGMENT_PROGRAM */ - key->uses_depth = (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0; + if (intel->gen == 5) + wm->wm4.sampler_count = 0; /* 
hardware requirement */ + else { + /* CACHE_NEW_SAMPLER */ + wm->wm4.sampler_count = (brw->wm.sampler_count + 1) / 4; + } - /* as far as we can tell */ - key->computes_depth = - (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0; + if (brw->wm.sampler_bo != NULL) { + /* reloc */ + wm->wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5; + } else { + wm->wm4.sampler_state_pointer = 0; + } + + /* BRW_NEW_FRAGMENT_PROGRAM */ + wm->wm5.program_uses_depth = (fp->Base.InputsRead & + (1 << FRAG_ATTRIB_WPOS)) != 0; + wm->wm5.program_computes_depth = (fp->Base.OutputsWritten & + BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0; /* BRW_NEW_DEPTH_BUFFER * Override for NULL depthbuffer case, required by the Pixel Shader Computed * Depth field. */ if (brw->state.depth_region == NULL) - key->computes_depth = 0; - - /* _NEW_BUFFERS | _NEW_COLOR */ - key->color_write_enable = brw_color_buffer_write_enabled(brw); + wm->wm5.program_computes_depth = 0; /* _NEW_COLOR */ - key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled; + wm->wm5.program_uses_killpixel = fp->UsesKill || ctx->Color.AlphaEnabled; + - /* If using the fragment shader backend, the program is always - * 8-wide. + /* BRW_NEW_FRAGMENT_PROGRAM + * + * If using the fragment shader backend, the program is always + * 8-wide. If not, it's always 16. 
*/ if (ctx->Shader.CurrentFragmentProgram) { struct brw_shader *shader = (struct brw_shader *) ctx->Shader.CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]; if (shader != NULL && shader->ir != NULL) { - key->is_glsl = GL_TRUE; + wm->wm5.enable_8_pix = 1; + if (brw->wm.prog_data->prog_offset_16) + wm->wm5.enable_16_pix = 1; } } + if (!wm->wm5.enable_8_pix) + wm->wm5.enable_16_pix = 1; - /* _NEW_DEPTH */ - key->stats_wm = intel->stats_wm; - - /* _NEW_LINE */ - key->line_stipple = ctx->Line.StippleFlag; - - /* _NEW_POLYGON */ - key->offset_enable = ctx->Polygon.OffsetFill; - key->offset_units = ctx->Polygon.OffsetUnits; - key->offset_factor = ctx->Polygon.OffsetFactor; -} - -/** - * Setup wm hardware state. See page 225 of Volume 2 - */ -static drm_intel_bo * -wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, - drm_intel_bo **reloc_bufs) -{ - struct intel_context *intel = &brw->intel; - struct brw_wm_unit_state wm; - drm_intel_bo *bo; - - memset(&wm, 0, sizeof(wm)); - - wm.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; - wm.wm9.grf_reg_count_2 = ALIGN(key->total_grf_16, 16) / 16 - 1; - wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */ - wm.wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset + - key->prog_offset_16) >> 6; /* reloc */ - wm.thread1.depth_coef_urb_read_offset = 1; - wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; - - if (intel->gen == 5) - wm.thread1.binding_table_entry_count = 0; /* hardware requirement */ - else - wm.thread1.binding_table_entry_count = key->nr_surfaces; - - if (key->total_scratch != 0) { - wm.thread2.scratch_space_base_pointer = - brw->wm.scratch_bo->offset >> 10; /* reloc */ - wm.thread2.per_thread_scratch_space = ffs(key->total_scratch) - 11; - } else { - wm.thread2.scratch_space_base_pointer = 0; - wm.thread2.per_thread_scratch_space = 0; - } - - wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg; - 
wm.thread3.urb_entry_read_length = key->urb_entry_read_length; - wm.thread3.urb_entry_read_offset = 0; - wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length; - wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2; - - if (intel->gen == 5) - wm.wm4.sampler_count = 0; /* hardware requirement */ - else - wm.wm4.sampler_count = (key->sampler_count + 1) / 4; - - if (brw->wm.sampler_bo != NULL) { - /* reloc */ - wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5; - } else { - wm.wm4.sampler_state_pointer = 0; - } - - wm.wm5.program_uses_depth = key->uses_depth; - wm.wm5.program_computes_depth = key->computes_depth; - wm.wm5.program_uses_killpixel = key->uses_kill; + wm->wm5.max_threads = brw->wm_max_threads - 1; - if (key->is_glsl) { - wm.wm5.enable_8_pix = 1; - if (key->prog_offset_16) - wm.wm5.enable_16_pix = 1; - } else - wm.wm5.enable_16_pix = 1; - - wm.wm5.max_threads = brw->wm_max_threads - 1; - - if (key->color_write_enable || - key->uses_kill || - key->computes_depth) { - wm.wm5.thread_dispatch_enable = 1; + /* _NEW_BUFFERS | _NEW_COLOR */ + if (brw_color_buffer_write_enabled(brw) || + wm->wm5.program_uses_killpixel || + wm->wm5.program_computes_depth) { + wm->wm5.thread_dispatch_enable = 1; } - wm.wm5.legacy_line_rast = 0; - wm.wm5.legacy_global_depth_bias = 0; - wm.wm5.early_depth_test = 1; /* never need to disable */ - wm.wm5.line_aa_region_width = 0; - wm.wm5.line_endcap_aa_region_width = 1; + wm->wm5.legacy_line_rast = 0; + wm->wm5.legacy_global_depth_bias = 0; + wm->wm5.early_depth_test = 1; /* never need to disable */ + wm->wm5.line_aa_region_width = 0; + wm->wm5.line_endcap_aa_region_width = 1; - wm.wm5.polygon_stipple = key->polygon_stipple; + /* _NEW_POLYGONSTIPPLE */ + wm->wm5.polygon_stipple = ctx->Polygon.StippleFlag; - if (key->offset_enable) { - wm.wm5.depth_offset = 1; + /* _NEW_POLYGON */ + if (ctx->Polygon.OffsetFill) { + wm->wm5.depth_offset = 1; /* Something wierd going on with legacy_global_depth_bias, * 
offset_constant, scaling and MRD. This value passes glean * but gives some odd results elsewere (eg. the * quad-offset-units test). */ - wm.global_depth_offset_constant = key->offset_units * 2; + wm->global_depth_offset_constant = ctx->Polygon.OffsetUnits * 2; /* This is the only value that passes glean: */ - wm.global_depth_offset_scale = key->offset_factor; + wm->global_depth_offset_scale = ctx->Polygon.OffsetFactor; } - wm.wm5.line_stipple = key->line_stipple; - - if (unlikely(INTEL_DEBUG & DEBUG_STATS) || key->stats_wm) - wm.wm4.stats_enable = 1; + /* _NEW_LINE */ + wm->wm5.line_stipple = ctx->Line.StippleFlag; - bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT, - key, sizeof(*key), - reloc_bufs, 3, - &wm, sizeof(wm)); + /* _NEW_DEPTH */ + if (unlikely(INTEL_DEBUG & DEBUG_STATS) || intel->stats_wm) + wm->wm4.stats_enable = 1; /* Emit WM program relocation */ - drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread0), - brw->wm.prog_bo, wm.thread0.grf_reg_count << 1, + drm_intel_bo_emit_reloc(intel->batch.bo, + brw->wm.state_offset + + offsetof(struct brw_wm_unit_state, thread0), + brw->wm.prog_bo, wm->thread0.grf_reg_count << 1, I915_GEM_DOMAIN_INSTRUCTION, 0); - if (key->prog_offset_16) { - drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, wm9), - brw->wm.prog_bo, ((wm.wm9.grf_reg_count_2 << 1) + - key->prog_offset_16), + if (brw->wm.prog_data->prog_offset_16) { + drm_intel_bo_emit_reloc(intel->batch.bo, + brw->wm.state_offset + + offsetof(struct brw_wm_unit_state, wm9), + brw->wm.prog_bo, + ((wm->wm9.grf_reg_count_2 << 1) + + brw->wm.prog_data->prog_offset_16), I915_GEM_DOMAIN_INSTRUCTION, 0); } /* Emit scratch space relocation */ - if (key->total_scratch != 0) { - drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2), + if (brw->wm.prog_data->total_scratch != 0) { + drm_intel_bo_emit_reloc(intel->batch.bo, + brw->wm.state_offset + + offsetof(struct brw_wm_unit_state, thread2), brw->wm.scratch_bo, - 
wm.thread2.per_thread_scratch_space, + wm->thread2.per_thread_scratch_space, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); } /* Emit sampler state relocation */ - if (key->sampler_count != 0) { - drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, wm4), - brw->wm.sampler_bo, (wm.wm4.stats_enable | - (wm.wm4.sampler_count << 2)), + if (brw->wm.sampler_count != 0) { + drm_intel_bo_emit_reloc(intel->batch.bo, + brw->wm.state_offset + + offsetof(struct brw_wm_unit_state, wm4), + brw->wm.sampler_bo, (wm->wm4.stats_enable | + (wm->wm4.sampler_count << 2)), I915_GEM_DOMAIN_INSTRUCTION, 0); } - return bo; -} - - -static void upload_wm_unit( struct brw_context *brw ) -{ - struct brw_wm_unit_key key; - drm_intel_bo *reloc_bufs[3]; - wm_unit_populate_key(brw, &key); - - reloc_bufs[0] = brw->wm.prog_bo; - reloc_bufs[1] = brw->wm.scratch_bo; - reloc_bufs[2] = brw->wm.sampler_bo; - - drm_intel_bo_unreference(brw->wm.state_bo); - brw->wm.state_bo = brw_search_cache(&brw->cache, BRW_WM_UNIT, - &key, sizeof(key), - reloc_bufs, 3, - NULL); - if (brw->wm.state_bo == NULL) { - brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs); - } + brw->state.dirty.cache |= CACHE_NEW_WM_UNIT; } const struct brw_tracked_state brw_wm_unit = { @@ -329,7 +261,8 @@ const struct brw_tracked_state brw_wm_unit = { _NEW_DEPTH | _NEW_BUFFERS), - .brw = (BRW_NEW_FRAGMENT_PROGRAM | + .brw = (BRW_NEW_BATCH | + BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_CURBE_OFFSETS | BRW_NEW_DEPTH_BUFFER | BRW_NEW_NR_WM_SURFACES), @@ -337,6 +270,6 @@ const struct brw_tracked_state brw_wm_unit = { .cache = (CACHE_NEW_WM_PROG | CACHE_NEW_SAMPLER) }, - .prepare = upload_wm_unit, + .prepare = brw_prepare_wm_unit, }; -- 2.30.2