From b1be5bd205d3efcaf4012d2c9a12831da57fc7fb Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 9 Mar 2011 11:11:04 -0800 Subject: [PATCH] i965: Change the SF unit from state caching to state streaming. This is a 28.1% +/- 1.4% (n=10) performance improvement for the hacked-up-for-cache-misses scissor-many, and no statistically significant wall-time performance difference for the hacked-up-for-cache-hits version (n=9, first outlier in each removed since IPS was warming up). User time increased by about 4.7%, but kernel time decreased equivalently. --- src/mesa/drivers/dri/i965/brw_context.h | 1 + src/mesa/drivers/dri/i965/brw_misc_state.c | 3 +- src/mesa/drivers/dri/i965/brw_sf_state.c | 148 ++++++--------------- 3 files changed, 45 insertions(+), 107 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 7b0551a92bc..5cf5590b606 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -639,6 +639,7 @@ struct brw_context drm_intel_bo *prog_bo; drm_intel_bo *state_bo; + uint32_t state_offset; drm_intel_bo *vp_bo; } sf; diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c index c768be23fa7..92eba8fe173 100644 --- a/src/mesa/drivers/dri/i965/brw_misc_state.c +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c @@ -149,7 +149,8 @@ static void upload_pipelined_state_pointers(struct brw_context *brw ) else OUT_BATCH(0); OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); - OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, + brw->sf.state_offset); OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, brw->cc.state_offset); diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c index 560d51f2c6b..f8b5275561d 
100644 --- a/src/mesa/drivers/dri/i965/brw_sf_state.c +++ b/src/mesa/drivers/dri/i965/brw_sf_state.c @@ -119,72 +119,21 @@ const struct brw_tracked_state brw_sf_vp = { .prepare = upload_sf_vp }; -struct brw_sf_unit_key { - unsigned int total_grf; - unsigned int urb_entry_read_length; - - unsigned int nr_urb_entries, urb_size, sfsize; - - GLenum front_face, cull_face; - unsigned pv_first:1; - unsigned scissor:1; - unsigned line_smooth:1; - unsigned point_sprite:1; - unsigned use_vs_point_size:1; - unsigned render_to_fbo:1; - float line_width; - float point_size; -}; - -static void -sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key) -{ - struct gl_context *ctx = &brw->intel.ctx; - memset(key, 0, sizeof(*key)); - - /* CACHE_NEW_SF_PROG */ - key->total_grf = brw->sf.prog_data->total_grf; - key->urb_entry_read_length = brw->sf.prog_data->urb_read_length; - - /* BRW_NEW_URB_FENCE */ - key->nr_urb_entries = brw->urb.nr_sf_entries; - key->urb_size = brw->urb.vsize; - key->sfsize = brw->urb.sfsize; - - key->scissor = ctx->Scissor.Enabled; - key->front_face = ctx->Polygon.FrontFace; - - if (ctx->Polygon.CullFlag) - key->cull_face = ctx->Polygon.CullFaceMode; - else - key->cull_face = GL_NONE; - - key->line_width = ctx->Line.Width; - key->line_smooth = ctx->Line.SmoothFlag; - - key->point_sprite = ctx->Point.PointSprite; - key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); - key->use_vs_point_size = (ctx->VertexProgram.PointSizeEnabled || - ctx->Point._Attenuated); - - /* _NEW_LIGHT */ - key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION); - - key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0; -} - -static drm_intel_bo * -sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, - drm_intel_bo **reloc_bufs) +static void upload_sf_unit( struct brw_context *brw ) { struct intel_context *intel = &brw->intel; - struct brw_sf_unit_state sf_stack, *sf = &sf_stack; - 
drm_intel_bo *bo; + struct gl_context *ctx = &intel->ctx; + struct brw_sf_unit_state *sf; + drm_intel_bo *bo = intel->batch.bo; int chipset_max_threads; + bool render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0; + + sf = brw_state_batch(brw, sizeof(*sf), 64, &brw->sf.state_offset); memset(sf, 0, sizeof(*sf)); - sf->thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; + /* CACHE_NEW_SF_PROG */ + sf->thread0.grf_reg_count = ALIGN(brw->sf.prog_data->total_grf, 16) / 16 - 1; sf->thread0.kernel_start_pointer = brw->sf.prog_bo->offset >> 6; /* reloc */ sf->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; @@ -196,10 +145,12 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, else sf->thread3.urb_entry_read_offset = 1; - sf->thread3.urb_entry_read_length = key->urb_entry_read_length; + /* CACHE_NEW_SF_PROG */ + sf->thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length; - sf->thread4.nr_urb_entries = key->nr_urb_entries; - sf->thread4.urb_entry_allocation_size = key->sfsize - 1; + /* BRW_NEW_URB_FENCE */ + sf->thread4.nr_urb_entries = brw->urb.nr_sf_entries; + sf->thread4.urb_entry_allocation_size = brw->urb.sfsize - 1; /* Each SF thread produces 1 PUE, and there can be up to 24 (Pre-Ironlake) or * 48 (Ironlake) threads. 
@@ -209,7 +160,9 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, else chipset_max_threads = 24; - sf->thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1; + /* BRW_NEW_URB_FENCE */ + sf->thread4.max_threads = MIN2(chipset_max_threads, + brw->urb.nr_sf_entries) - 1; if (unlikely(INTEL_DEBUG & DEBUG_SINGLE_THREAD)) sf->thread4.max_threads = 0; @@ -223,21 +176,23 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, sf->sf5.viewport_transform = 1; /* _NEW_SCISSOR */ - if (key->scissor) + if (ctx->Scissor.Enabled) sf->sf6.scissor = 1; /* _NEW_POLYGON */ - if (key->front_face == GL_CCW) + if (ctx->Polygon.FrontFace == GL_CCW) sf->sf5.front_winding = BRW_FRONTWINDING_CCW; else sf->sf5.front_winding = BRW_FRONTWINDING_CW; - /* The viewport is inverted for rendering to a FBO, and that inverts + /* _NEW_BUFFERS + * The viewport is inverted for rendering to a FBO, and that inverts * polygon front/back orientation. */ - sf->sf5.front_winding ^= key->render_to_fbo; + sf->sf5.front_winding ^= render_to_fbo; - switch (key->cull_face) { + /* _NEW_POLYGON */ + switch (ctx->Polygon.CullFlag ? 
ctx->Polygon.CullFaceMode : GL_NONE) { case GL_FRONT: sf->sf6.cull_mode = BRW_CULLMODE_FRONT; break; @@ -257,17 +212,16 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, /* _NEW_LINE */ /* XXX use ctx->Const.Min/MaxLineWidth here */ - sf->sf6.line_width = CLAMP(key->line_width, 1.0, 5.0) * (1<<1); + sf->sf6.line_width = CLAMP(ctx->Line.Width, 1.0, 5.0) * (1<<1); sf->sf6.line_endcap_aa_region_width = 1; - if (key->line_smooth) + if (ctx->Line.SmoothFlag) sf->sf6.aa_enable = 1; else if (sf->sf6.line_width <= 0x2) sf->sf6.line_width = 0; /* _NEW_BUFFERS */ - key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0; - if (!key->render_to_fbo) { + if (!render_to_fbo) { /* Rendering to an OpenGL window */ sf->sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT; } @@ -296,14 +250,18 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, /* XXX clamp max depends on AA vs. non-AA */ /* _NEW_POINT */ - sf->sf7.sprite_point = key->point_sprite; - sf->sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3); - sf->sf7.use_point_size_state = !key->use_vs_point_size; + sf->sf7.sprite_point = ctx->Point.PointSprite; + sf->sf7.point_size = CLAMP(rint(CLAMP(ctx->Point.Size, + ctx->Point.MinSize, + ctx->Point.MaxSize)), 1, 255) * (1<<3); + sf->sf7.use_point_size_state = !(ctx->VertexProgram.PointSizeEnabled || + ctx->Point._Attenuated); sf->sf7.aa_line_distance_mode = 0; /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons: + * _NEW_LIGHT */ - if (!key->pv_first) { + if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) { sf->sf7.trifan_pv = 2; sf->sf7.linestrip_pv = 1; sf->sf7.tristrip_pv = 2; @@ -319,46 +277,23 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, sf->sf6.dest_org_vbias = 0x8; sf->sf6.dest_org_hbias = 0x8; - bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT, - key, sizeof(*key), - reloc_bufs, 2, - sf, sizeof(*sf)); - /* STATE_PREFETCH command description 
describes this state as being * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain. */ /* Emit SF program relocation */ - drm_intel_bo_emit_reloc(bo, offsetof(struct brw_sf_unit_state, thread0), + drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset + + offsetof(struct brw_sf_unit_state, thread0)), brw->sf.prog_bo, sf->thread0.grf_reg_count << 1, I915_GEM_DOMAIN_INSTRUCTION, 0); /* Emit SF viewport relocation */ - drm_intel_bo_emit_reloc(bo, offsetof(struct brw_sf_unit_state, sf5), + drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset + + offsetof(struct brw_sf_unit_state, sf5)), brw->sf.vp_bo, (sf->sf5.front_winding | (sf->sf5.viewport_transform << 1)), I915_GEM_DOMAIN_INSTRUCTION, 0); - return bo; -} - -static void upload_sf_unit( struct brw_context *brw ) -{ - struct brw_sf_unit_key key; - drm_intel_bo *reloc_bufs[2]; - - sf_unit_populate_key(brw, &key); - - reloc_bufs[0] = brw->sf.prog_bo; - reloc_bufs[1] = brw->sf.vp_bo; - - drm_intel_bo_unreference(brw->sf.state_bo); - brw->sf.state_bo = brw_search_cache(&brw->cache, BRW_SF_UNIT, - &key, sizeof(key), - reloc_bufs, 2, - NULL); - if (brw->sf.state_bo == NULL) { - brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs); - } + brw->state.dirty.cache |= CACHE_NEW_SF_UNIT; } const struct brw_tracked_state brw_sf_unit = { @@ -369,7 +304,8 @@ const struct brw_tracked_state brw_sf_unit = { _NEW_POINT | _NEW_SCISSOR | _NEW_BUFFERS), - .brw = BRW_NEW_URB_FENCE, + .brw = (BRW_NEW_BATCH | + BRW_NEW_URB_FENCE), .cache = (CACHE_NEW_SF_VP | CACHE_NEW_SF_PROG) }, -- 2.30.2