From d4976158c7f32705b48c773c3abd1b22bebe9c16 Mon Sep 17 00:00:00 2001 From: Paul Berry Date: Tue, 29 Nov 2011 14:51:03 -0800 Subject: [PATCH] i965 gen6: Implement pass-through GS for transform feedback. In Gen6, transform feedback is accomplished by having the geometry shader send vertex data to the data port using "Streamed Vertex Buffer Write" messages, while simultaneously passing vertices through to the rest of the graphics pipeline (if rendering is enabled). This patch adds a geometry shader program that simply passes vertices through to the rest of the graphics pipeline. The rest of transform feedback functionality will be added in future patches. To make the new geometry shader easier to test, I've added an environment variable "INTEL_FORCE_GS". If this environment variable is set (to any value; only its presence is checked), then the pass-through geometry shader will always be used, regardless of whether transform feedback is in effect. On my Sandy Bridge laptop, I'm able to enable INTEL_FORCE_GS with no Piglit regressions. 
Reviewed-by: Kenneth Graunke Acked-by: Eric Anholt --- src/mesa/drivers/dri/i965/brw_defines.h | 3 + src/mesa/drivers/dri/i965/brw_eu.h | 5 + src/mesa/drivers/dri/i965/brw_gs.c | 106 ++++++++++++++++------ src/mesa/drivers/dri/i965/brw_gs.h | 2 + src/mesa/drivers/dri/i965/brw_gs_emit.c | 92 +++++++++++++++++++ src/mesa/drivers/dri/i965/gen6_gs_state.c | 46 ++++++---- 6 files changed, 208 insertions(+), 46 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index d94923195d5..95039aa65bc 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1075,6 +1075,9 @@ enum brw_message_target { # define GEN6_GS_SVBI_POSTINCREMENT_VALUE_MASK INTEL_MASK(25, 16) # define GEN6_GS_ENABLE (1 << 15) +# define BRW_GS_EDGE_INDICATOR_0 (1 << 8) +# define BRW_GS_EDGE_INDICATOR_1 (1 << 9) + #define _3DSTATE_HS 0x781B /* GEN7+ */ #define _3DSTATE_TE 0x781C /* GEN7+ */ #define _3DSTATE_DS 0x781D /* GEN7+ */ diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index dcb1fc91678..596be02158c 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -650,6 +650,11 @@ static INLINE struct brw_reg get_element_ud( struct brw_reg reg, GLuint elt ) return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt)); } +static INLINE struct brw_reg get_element_d( struct brw_reg reg, GLuint elt ) +{ + return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_D), elt)); +} + static INLINE struct brw_reg brw_swizzle( struct brw_reg reg, GLuint x, diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index e72ff5e5a8f..69ffa19c40c 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -53,12 +53,6 @@ static void compile_gs_prog( struct brw_context *brw, void *mem_ctx; GLuint program_size; - /* Gen6: VF has already converted into polygon, and LINELOOP is - * converted to LINESTRIP 
at the beginning of the 3D pipeline. - */ - if (intel->gen >= 6) - return; - memset(&c, 0, sizeof(c)); c.key = *key; @@ -80,24 +74,60 @@ static void compile_gs_prog( struct brw_context *brw, */ brw_set_mask_control(&c.func, BRW_MASK_DISABLE); - - /* Note that primitives which don't require a GS program have - * already been weeded out by this stage: - */ - - switch (key->primitive) { - case _3DPRIM_QUADLIST: - brw_gs_quads( &c, key ); - break; - case _3DPRIM_QUADSTRIP: - brw_gs_quad_strip( &c, key ); - break; - case _3DPRIM_LINELOOP: - brw_gs_lines( &c ); - break; - default: - ralloc_free(mem_ctx); - return; + if (intel->gen >= 6) { + unsigned num_verts; + bool check_edge_flag; + /* On Sandybridge, we use the GS for implementing transform feedback + * (called "Stream Out" in the PRM). + */ + switch (key->primitive) { + case _3DPRIM_POINTLIST: + num_verts = 1; + check_edge_flag = false; + break; + case _3DPRIM_LINELIST: + case _3DPRIM_LINESTRIP: + case _3DPRIM_LINELOOP: + num_verts = 2; + check_edge_flag = false; + break; + case _3DPRIM_TRILIST: + case _3DPRIM_TRIFAN: + case _3DPRIM_TRISTRIP: + case _3DPRIM_RECTLIST: + num_verts = 3; + check_edge_flag = false; + break; + case _3DPRIM_QUADLIST: + case _3DPRIM_QUADSTRIP: + case _3DPRIM_POLYGON: + num_verts = 3; + check_edge_flag = true; + break; + default: + assert(!"Unexpected primitive type in Gen6 SOL program."); + return; + } + gen6_sol_program(&c, key, num_verts, check_edge_flag); + } else { + /* On Gen4-5, we use the GS to decompose certain types of primitives. + * Note that primitives which don't require a GS program have already + * been weeded out by now. 
+ */ + switch (key->primitive) { + case _3DPRIM_QUADLIST: + brw_gs_quads( &c, key ); + break; + case _3DPRIM_QUADSTRIP: + brw_gs_quad_strip( &c, key ); + break; + case _3DPRIM_LINELOOP: + brw_gs_lines( &c ); + break; + default: + ralloc_free(mem_ctx); + return; + } } /* get the program @@ -148,11 +178,26 @@ static void populate_key( struct brw_context *brw, /* _NEW_TRANSFORM */ key->userclip_active = (ctx->Transform.ClipPlanesEnabled != 0); - key->need_gs_prog = (intel->gen >= 6) - ? 0 - : (brw->primitive == _3DPRIM_QUADLIST || - brw->primitive == _3DPRIM_QUADSTRIP || - brw->primitive == _3DPRIM_LINELOOP); + if (intel->gen >= 7) { + /* On Gen7 and later, we don't use GS (yet). */ + key->need_gs_prog = false; + } else if (intel->gen == 6) { + /* On Gen6, GS is used for transform feedback. */ + /* _NEW_TRANSFORM_FEEDBACK */ + key->need_gs_prog = ctx->TransformFeedback.CurrentObject->Active; + } else { + /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP + * into simpler primitives. + */ + key->need_gs_prog = (brw->primitive == _3DPRIM_QUADLIST || + brw->primitive == _3DPRIM_QUADSTRIP || + brw->primitive == _3DPRIM_LINELOOP); + } + /* For testing, the environment variable INTEL_FORCE_GS can be used to + * force a GS program to be used, even if it's not necessary. + */ + if (getenv("INTEL_FORCE_GS")) + key->need_gs_prog = true; } /* Calculate interpolants for triangle and line rasterization. 
@@ -183,7 +228,8 @@ brw_upload_gs_prog(struct brw_context *brw) const struct brw_tracked_state brw_gs_prog = { .dirty = { .mesa = (_NEW_LIGHT | - _NEW_TRANSFORM), + _NEW_TRANSFORM | + _NEW_TRANSFORM_FEEDBACK), .brw = BRW_NEW_PRIMITIVE, .cache = CACHE_NEW_VS_PROG }, diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h index 93448a77f08..abcb0b2db59 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.h +++ b/src/mesa/drivers/dri/i965/brw_gs.h @@ -73,5 +73,7 @@ struct brw_gs_compile { void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key ); void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key ); void brw_gs_lines( struct brw_gs_compile *c ); +void gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key, + unsigned num_verts, bool check_edge_flag); #endif diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c index 9b1dfbfa97b..322f9bd81c1 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c @@ -100,6 +100,37 @@ static void brw_gs_overwrite_header_dw2(struct brw_gs_compile *c, brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2)); } +/** + * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0. + * + * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0 + * of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of + * DWORD 2. So this function extracts the primitive type field, bitshifts it + * appropriately, and stores it in c->reg.header. + */ +static void brw_gs_overwrite_header_dw2_from_r0(struct brw_gs_compile *c) +{ + struct brw_compile *p = &c->func; + brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2), + brw_imm_ud(0x1f)); + brw_SHL(p, get_element_ud(c->reg.header, 2), + get_element_ud(c->reg.header, 2), brw_imm_ud(2)); +} + +/** + * Apply an additive offset to DWORD 2 of c->reg.header. 
+ * + * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately + * for each vertex. + */ +static void brw_gs_offset_header_dw2(struct brw_gs_compile *c, int offset) +{ + struct brw_compile *p = &c->func; + brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2), + brw_imm_d(offset)); +} + + /** * Emit a vertex using the URB_WRITE message. Use the contents of * c->reg.header for the message header, and the registers starting at \c vert @@ -269,3 +300,64 @@ void brw_gs_lines( struct brw_gs_compile *c ) | URB_WRITE_PRIM_END)); brw_gs_emit_vue(c, c->reg.vertex[1], 1); } + +/** + * Generate the geometry shader program used on Gen6 to perform stream output + * (transform feedback). + */ +void +gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key, + unsigned num_verts, bool check_edge_flags) +{ + struct brw_compile *p = &c->func; + + brw_gs_alloc_regs(c, num_verts); + brw_gs_initialize_header(c); + + brw_gs_ff_sync(c, 1); + + brw_gs_overwrite_header_dw2_from_r0(c); + switch (num_verts) { + case 1: + brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START | URB_WRITE_PRIM_END); + brw_gs_emit_vue(c, c->reg.vertex[0], true); + break; + case 2: + brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START); + brw_gs_emit_vue(c, c->reg.vertex[0], false); + brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_END - URB_WRITE_PRIM_START); + brw_gs_emit_vue(c, c->reg.vertex[1], true); + break; + case 3: + if (check_edge_flags) { + /* Only emit vertices 0 and 1 if this is the first triangle of the + * polygon. Otherwise they are redundant. 
+ */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + get_element_ud(c->reg.R0, 2), + brw_imm_ud(BRW_GS_EDGE_INDICATOR_0)); + brw_IF(p, BRW_EXECUTE_1); + } + brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START); + brw_gs_emit_vue(c, c->reg.vertex[0], false); + brw_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START); + brw_gs_emit_vue(c, c->reg.vertex[1], false); + if (check_edge_flags) { + brw_ENDIF(p); + /* Only emit vertex 2 in PRIM_END mode if this is the last triangle + * of the polygon. Otherwise leave the primitive incomplete because + * there are more polygon vertices coming. + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + get_element_ud(c->reg.R0, 2), + brw_imm_ud(BRW_GS_EDGE_INDICATOR_1)); + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + } + brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_END); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_gs_emit_vue(c, c->reg.vertex[2], true); + break; + } +} diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c index d29f0290727..42962a64d36 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c @@ -44,22 +44,36 @@ upload_gs_state(struct brw_context *brw) OUT_BATCH(0); ADVANCE_BATCH(); - // GS should never be used on Gen6. Disable it. 
- assert(!brw->gs.prog_active); - BEGIN_BATCH(7); - OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); - OUT_BATCH(0); /* prog_bo */ - OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | - (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); - OUT_BATCH(0); /* scratch space base offset */ - OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | - (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) | - (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT)); - OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) | - GEN6_GS_STATISTICS_ENABLE | - GEN6_GS_RENDERING_ENABLE); - OUT_BATCH(0); - ADVANCE_BATCH(); + if (brw->gs.prog_active) { + BEGIN_BATCH(7); + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); + OUT_BATCH(brw->gs.prog_offset); + OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE); + OUT_BATCH(0); /* no scratch space */ + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | + (brw->gs.prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT)); + OUT_BATCH(((brw->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) | + GEN6_GS_STATISTICS_ENABLE | + GEN6_GS_SO_STATISTICS_ENABLE | + GEN6_GS_RENDERING_ENABLE); + OUT_BATCH(GEN6_GS_ENABLE); + ADVANCE_BATCH(); + } else { + BEGIN_BATCH(7); + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); + OUT_BATCH(0); /* prog_bo */ + OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | + (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + OUT_BATCH(0); /* scratch space base offset */ + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | + (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) | + (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT)); + OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) | + GEN6_GS_STATISTICS_ENABLE | + GEN6_GS_RENDERING_ENABLE); + OUT_BATCH(0); + ADVANCE_BATCH(); + } } const struct brw_tracked_state gen6_gs_state = { -- 2.30.2