From f0cecd43d6b6d3f5def3fd43b9c95baaf3be9b16 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 13 Feb 2012 10:08:23 -0800 Subject: [PATCH] i965: Move VUE map computation to once at VS compile time. With this and the previous patch, 640x480 nexuiz is running 0.169118% +/- 0.0863696% faster (n=121). On a VS state change microbenchmark, performance is increased 8.28645% +/- 0.460478% (n=52). v2: Fix CACHE_NEW_VS comment. Reviewed-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/brw_clip.c | 2 +- src/mesa/drivers/dri/i965/brw_context.h | 5 ++--- src/mesa/drivers/dri/i965/brw_gs.c | 2 +- src/mesa/drivers/dri/i965/brw_sf.c | 2 +- src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 2 +- .../drivers/dri/i965/brw_vec4_visitor.cpp | 12 +++++------ src/mesa/drivers/dri/i965/brw_vs.c | 16 ++++++++------- src/mesa/drivers/dri/i965/brw_vs.h | 1 - src/mesa/drivers/dri/i965/brw_vs_emit.c | 20 +++++++++---------- src/mesa/drivers/dri/i965/gen6_sf_state.c | 10 +++++----- src/mesa/drivers/dri/i965/gen7_sf_state.c | 10 +++++----- src/mesa/drivers/dri/i965/gen7_sol_state.c | 9 +++------ 12 files changed, 42 insertions(+), 49 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c index 5b5f551948e..d411208f2c9 100644 --- a/src/mesa/drivers/dri/i965/brw_clip.c +++ b/src/mesa/drivers/dri/i965/brw_clip.c @@ -69,7 +69,7 @@ static void compile_clip_prog( struct brw_context *brw, c.func.single_program_flow = 1; c.key = *key; - brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data); + c.vue_map = brw->vs.prog_data->vue_map; /* nr_regs is the number of registers filled by reading data from the VUE. * This program accesses the entire VUE, so nr_regs needs to be the size of diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index c6860a76d50..503585c70c8 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -381,6 +381,8 @@ struct brw_gs_prog_data { }; struct brw_vs_prog_data { + struct brw_vue_map vue_map; + GLuint curb_read_length; GLuint urb_read_length; GLuint total_grf; @@ -1045,9 +1047,6 @@ void brw_upload_cs_urb_state(struct brw_context *brw); int brw_disasm (FILE *file, struct brw_instruction *inst, int gen); /* brw_vs.c */ -void brw_compute_vue_map(struct brw_vue_map *vue_map, - const struct intel_context *intel, - const struct brw_vs_prog_data *prog_data); gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx); /* brw_wm.c */ diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index c6132df26d1..bfca169f7cb 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -56,7 +56,7 @@ static void compile_gs_prog( struct brw_context *brw, memset(&c, 0, sizeof(c)); c.key = *key; - brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data); + c.vue_map = brw->vs.prog_data->vue_map; c.nr_regs = (c.vue_map.num_slots + 1)/2; mem_ctx = NULL; diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c index 6c28d773c59..6e63583e546 100644 --- a/src/mesa/drivers/dri/i965/brw_sf.c +++ b/src/mesa/drivers/dri/i965/brw_sf.c @@ -63,7 +63,7 @@ static void compile_sf_prog( struct brw_context *brw, brw_init_compile(brw, &c.func, mem_ctx); c.key = *key; - brw_compute_vue_map(&c.vue_map, intel, brw->vs.prog_data); + c.vue_map = brw->vs.prog_data->vue_map; c.urb_entry_read_offset = brw_sf_compute_urb_entry_read_offset(intel); c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset; c.nr_setup_regs = c.nr_attr_regs; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp index 917c927a125..f9eed61d92c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp @@ -96,7 +96,7 @@ vec4_visitor::setup_attributes(int payload_reg) prog_data->urb_read_length = (nr_attributes + 1) / 2; - unsigned vue_entries = MAX2(nr_attributes, c->vue_map.num_slots); + unsigned vue_entries = MAX2(nr_attributes, c->prog_data.vue_map.num_slots); if (intel->gen == 6) c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 5dfe1c1354c..f9a08a011f2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -2252,8 +2252,6 @@ vec4_visitor::emit_urb_writes() /* FINISHME: edgeflag */ - brw_compute_vue_map(&c->vue_map, intel, &c->prog_data); - /* First mrf is the g0-based message header containing URB handles and such, * which is implied in VS_OPCODE_URB_WRITE. */ @@ -2265,8 +2263,8 @@ vec4_visitor::emit_urb_writes() /* Set up the VUE data for the first URB write */ int slot; - for (slot = 0; slot < c->vue_map.num_slots; ++slot) { - emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]); + for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) { + emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]); /* If this was max_usable_mrf, we can't fit anything more into this URB * WRITE. @@ -2281,16 +2279,16 @@ vec4_visitor::emit_urb_writes() vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); inst->base_mrf = base_mrf; inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf); - inst->eot = (slot >= c->vue_map.num_slots); + inst->eot = (slot >= c->prog_data.vue_map.num_slots); /* Optional second URB write */ if (!inst->eot) { mrf = base_mrf + 1; - for (; slot < c->vue_map.num_slots; ++slot) { + for (; slot < c->prog_data.vue_map.num_slots; ++slot) { assert(mrf < max_usable_mrf); - emit_urb_slot(mrf++, c->vue_map.slot_to_vert_result[slot]); + emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]); } current_annotation = "URB write"; diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index be82177f40d..ca205cdf79a 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -57,13 +57,13 @@ static inline void assign_vue_slot(struct brw_vue_map *vue_map, * prog_data->userclip and prog_data->outputs_written in their key * (generated by CACHE_NEW_VS_PROG). */ -void -brw_compute_vue_map(struct brw_vue_map *vue_map, - const struct intel_context *intel, - const struct brw_vs_prog_data *prog_data) +static void +brw_compute_vue_map(struct brw_vs_compile *c) { - bool userclip_active = prog_data->userclip; - GLbitfield64 outputs_written = prog_data->outputs_written; + struct brw_context *brw = c->func.brw; + const struct intel_context *intel = &brw->intel; + struct brw_vue_map *vue_map = &c->prog_data.vue_map; + GLbitfield64 outputs_written = c->prog_data.outputs_written; int i; vue_map->num_slots = 0; @@ -118,7 +118,7 @@ brw_compute_vue_map(struct brw_vue_map *vue_map, */ assign_vue_slot(vue_map, VERT_RESULT_PSIZ); assign_vue_slot(vue_map, VERT_RESULT_HPOS); - if (userclip_active) { + if (c->key.userclip_active) { assign_vue_slot(vue_map, VERT_RESULT_CLIP_DIST0); assign_vue_slot(vue_map, VERT_RESULT_CLIP_DIST1); } @@ -218,6 +218,8 @@ do_vs_prog(struct brw_context *brw, c.prog_data.inputs_read |= VERT_BIT_EDGEFLAG; } + brw_compute_vue_map(&c); + /* Put dummy slots into the VUE for the SF to put the replaced * point sprite coords in. We shouldn't need these dummy slots, * which take up precious URB space, but it would mean that the SF diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index 8814251a84c..490fcc0622b 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -92,7 +92,6 @@ struct brw_vs_compile { GLuint nr_inputs; - struct brw_vue_map vue_map; GLuint first_output; GLuint last_scratch; diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c index 07fc0af6a9d..4bdd366ff34 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c @@ -173,7 +173,6 @@ static inline bool can_use_direct_mrf(int vert_result, */ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) { - struct brw_context *brw = c->func.brw; struct intel_context *intel = &c->func.brw->intel; GLuint i, reg = 0, slot; int attributes_in_vue; @@ -326,13 +325,12 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) /* Allocate outputs. The non-position outputs go straight into message regs. */ - brw_compute_vue_map(&c->vue_map, intel, &c->prog_data); c->first_output = reg; first_reladdr_output = get_first_reladdr_output(&c->vp->program); - for (slot = 0; slot < c->vue_map.num_slots; slot++) { - int vert_result = c->vue_map.slot_to_vert_result[slot]; + for (slot = 0; slot < c->prog_data.vue_map.num_slots; slot++) { + int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot]; assert(vert_result < Elements(c->regs[PROGRAM_OUTPUT])); if (can_use_direct_mrf(vert_result, first_reladdr_output, slot)) { c->regs[PROGRAM_OUTPUT][vert_result] = brw_message_reg(slot + 1); @@ -405,7 +403,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size * them to fit the biggest thing they need to. */ - attributes_in_vue = MAX2(c->vue_map.num_slots, c->nr_inputs); + attributes_in_vue = MAX2(c->prog_data.vue_map.num_slots, c->nr_inputs); if (intel->gen == 6) { /* Each attribute is 32 bytes (2 vec4s), so dividing by 8 gives us the @@ -1678,12 +1676,12 @@ static void emit_vertex_write( struct brw_vs_compile *c) } /* Move variable-addressed, non-overflow outputs to their MRFs. */ - for (slot = len_vertex_header; slot < c->vue_map.num_slots; ++slot) { + for (slot = len_vertex_header; slot < c->prog_data.vue_map.num_slots; ++slot) { if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE) break; int mrf = slot + 1; - int vert_result = c->vue_map.slot_to_vert_result[slot]; + int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot]; if (c->regs[PROGRAM_OUTPUT][vert_result].file == BRW_GENERAL_REGISTER_FILE) { brw_MOV(p, brw_message_reg(mrf), @@ -1691,7 +1689,7 @@ static void emit_vertex_write( struct brw_vs_compile *c) } } - eot = (slot >= c->vue_map.num_slots); + eot = (slot >= c->prog_data.vue_map.num_slots); /* Message header, plus the (first part of the) VUE. */ msg_len = 1 + slot; @@ -1712,14 +1710,14 @@ static void emit_vertex_write( struct brw_vs_compile *c) 0, /* urb destination offset */ BRW_URB_SWIZZLE_INTERLEAVE); - if (slot < c->vue_map.num_slots) { + if (slot < c->prog_data.vue_map.num_slots) { /* Not all of the vertex outputs/results fit into the MRF. * Move the overflowed attributes from the GRF to the MRF and * issue another brw_urb_WRITE(). */ GLuint mrf = 1; - for (; slot < c->vue_map.num_slots; ++slot) { - int vert_result = c->vue_map.slot_to_vert_result[slot]; + for (; slot < c->prog_data.vue_map.num_slots; ++slot) { + int vert_result = c->prog_data.vue_map.slot_to_vert_result[slot]; /* move from GRF to MRF */ brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][vert_result]); diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index c4e7c4c6488..95ed1f74ce4 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -112,7 +112,6 @@ upload_sf_state(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; - struct brw_vue_map vue_map; uint32_t urb_entry_read_length; /* BRW_NEW_FRAGMENT_PROGRAM */ uint32_t num_outputs = _mesa_bitcount_64(brw->fragment_program->Base.InputsRead); @@ -129,8 +128,8 @@ upload_sf_state(struct brw_context *brw) uint32_t point_sprite_origin; /* CACHE_NEW_VS_PROG */ - brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data); - urb_entry_read_length = (vue_map.num_slots + 1)/2 - urb_entry_read_offset; + urb_entry_read_length = ((brw->vs.prog_data->vue_map.num_slots + 1) / 2 - + urb_entry_read_offset); if (urb_entry_read_length == 0) { /* Setting the URB entry read length to 0 causes undefined behavior, so * if we have no URB data to read, set it to 1. @@ -301,9 +300,10 @@ upload_sf_state(struct brw_context *brw) */ assert(input_index < 16 || attr == input_index); - /* _NEW_LIGHT | _NEW_PROGRAM */ + /* CACHE_NEW_VS_PROG | _NEW_LIGHT | _NEW_PROGRAM */ attr_overrides[input_index++] = - get_attr_override(&vue_map, urb_entry_read_offset, attr, + get_attr_override(&brw->vs.prog_data->vue_map, + urb_entry_read_offset, attr, ctx->VertexProgram._TwoSideEnabled); } diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c index 49460b2802d..c8f4393a529 100644 --- a/src/mesa/drivers/dri/i965/gen7_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c @@ -33,7 +33,6 @@ upload_sbe_state(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; - struct brw_vue_map vue_map; uint32_t urb_entry_read_length; /* BRW_NEW_FRAGMENT_PROGRAM */ uint32_t num_outputs = _mesa_bitcount_64(brw->fragment_program->Base.InputsRead); @@ -49,8 +48,8 @@ upload_sbe_state(struct brw_context *brw) uint32_t point_sprite_origin; /* CACHE_NEW_VS_PROG */ - brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data); - urb_entry_read_length = (vue_map.num_slots + 1)/2 - urb_entry_read_offset; + urb_entry_read_length = ((brw->vs.prog_data->vue_map.num_slots + 1) / 2 - + urb_entry_read_offset); if (urb_entry_read_length == 0) { /* Setting the URB entry read length to 0 causes undefined behavior, so * if we have no URB data to read, set it to 1. @@ -114,9 +113,10 @@ upload_sbe_state(struct brw_context *brw) */ assert(input_index < 16 || attr == input_index); - /* _NEW_LIGHT | _NEW_PROGRAM */ + /* CACHE_NEW_VS_PROG | _NEW_LIGHT | _NEW_PROGRAM */ attr_overrides[input_index++] = - get_attr_override(&vue_map, urb_entry_read_offset, attr, + get_attr_override(&brw->vs.prog_data->vue_map, + urb_entry_read_offset, attr, ctx->VertexProgram._TwoSideEnabled); } diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c index 134153e6703..1a89503d2d5 100644 --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c @@ -239,14 +239,11 @@ upload_sol_state(struct brw_context *brw) struct gl_transform_feedback_object *xfb_obj = ctx->TransformFeedback.CurrentObject; bool active = xfb_obj->Active && !xfb_obj->Paused; - struct brw_vue_map vue_map; - - /* CACHE_NEW_VS_PROG */ - brw_compute_vue_map(&vue_map, intel, brw->vs.prog_data); if (active) { upload_3dstate_so_buffers(brw); - upload_3dstate_so_decl_list(brw, &vue_map); + /* CACHE_NEW_VS_PROG */ + upload_3dstate_so_decl_list(brw, &brw->vs.prog_data->vue_map); intel->batch.needs_sol_reset = true; } @@ -256,7 +253,7 @@ upload_sol_state(struct brw_context *brw) * MMIO register updates (current performed by the kernel at each batch * emit). */ - upload_3dstate_streamout(brw, active, &vue_map); + upload_3dstate_streamout(brw, active, &brw->vs.prog_data->vue_map); } const struct brw_tracked_state gen7_sol_state = { -- 2.30.2