From 697d666d7860b3bdced32ca7fde9dea38f67da15 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 11 Jun 2010 23:09:36 -0700 Subject: [PATCH] r300/compiler: Handle loops in deadcode analysis. This also allows us to split the loop emulation into two phases. A transformation phase which either unrolls loops or prepares them to be emulated, and the emulation phase which unrolls remaining loops until the instruction limit is reached. The second phase is completed after the deadcode analysis in order to get a more accurate count of the number of instructions in the body of loops. --- .../drivers/dri/r300/compiler/r3xx_fragprog.c | 22 ++++-- .../drivers/dri/r300/compiler/r3xx_vertprog.c | 10 ++- .../r300/compiler/radeon_dataflow_deadcode.c | 75 +++++++++++++------ .../dri/r300/compiler/radeon_emulate_loops.c | 60 ++++----------- .../dri/r300/compiler/radeon_emulate_loops.h | 22 +++++- 5 files changed, 112 insertions(+), 77 deletions(-) diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c index bbdfa0d56f9..31f556a96af 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c @@ -97,6 +97,8 @@ static void debug_program_log(struct r300_fragment_program_compiler* c, const ch void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) { + struct emulate_loop_state loop_state; + rewrite_depth_out(c); debug_program_log(c, "before compilation"); @@ -104,14 +106,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) /* XXX Ideally this should be done only for r3xx, but since * we don't have branching support for r5xx, we use the emulation * on all chipsets. 
*/ - - if (c->Base.is_r500) { - rc_emulate_loops(&c->Base, R500_PFS_MAX_INST); - } else { - rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST); - } - debug_program_log(c, "after emulate loops"); + rc_transform_unroll_loops(&c->Base, &loop_state); + + debug_program_log(c, "after transform loops"); + rc_emulate_branches(&c->Base); debug_program_log(c, "after emulate branches"); @@ -161,6 +160,15 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) debug_program_log(c, "after deadcode"); + if(c->Base.is_r500){ + rc_emulate_loops(&loop_state, R500_PFS_MAX_INST); + } + else{ + rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST); + } + + debug_program_log(c, "after emulate looops"); + rc_optimize(&c->Base); debug_program_log(c, "after dataflow optimize"); diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c index e984797e2d3..bd8d63246a2 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c @@ -593,6 +593,8 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = { void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) { + struct emulate_loop_state loop_state; + compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; addArtificialOutputs(compiler); @@ -602,10 +604,14 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) /* XXX Ideally this should be done only for r3xx, but since * we don't have branching support for r5xx, we use the emulation * on all chipsets. 
*/ + rc_transform_unroll_loops(&compiler->Base, &loop_state); + + debug_program_log(compiler, "after transform loops"); + if (compiler->Base.is_r500){ - rc_emulate_loops(&compiler->Base, R500_VS_MAX_ALU); + rc_emulate_loops(&loop_state, R500_VS_MAX_ALU); } else { - rc_emulate_loops(&compiler->Base, R300_VS_MAX_ALU); + rc_emulate_loops(&loop_state, R300_VS_MAX_ALU); } debug_program_log(compiler, "after emulate loops"); diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c index e3c2c83c0cf..f8bced2532b 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c @@ -202,32 +202,61 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f inst = inst->Prev) { const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - if (opcode->IsFlowControl) { - if (opcode->Opcode == RC_OPCODE_ENDIF) { - push_branch(&s); - } else { - if (s.BranchStackSize) { - struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1]; - - if (opcode->Opcode == RC_OPCODE_IF) { - or_updatemasks(&s.R, - &s.R, - branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif); - - s.BranchStackSize--; - } else if (opcode->Opcode == RC_OPCODE_ELSE) { - if (branch->HaveElse) { - rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__); - } else { - memcpy(&branch->StoreElse, &s.R, sizeof(s.R)); - memcpy(&s.R, &branch->StoreEndif, sizeof(s.R)); - branch->HaveElse = 1; - } + switch(opcode->Opcode){ + /* Mark all sources in the loop body as used before doing + * normal deadcode analysis. This is probably not optimal. 
+ */ + case RC_OPCODE_ENDLOOP: + { + int endloops = 1; + struct rc_instruction *ptr; + for(ptr = inst->Prev; endloops > 0; ptr = ptr->Prev){ + opcode = rc_get_opcode_info(ptr->U.I.Opcode); + if(ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){ + endloops--; + continue; + } + if(ptr->U.I.Opcode == RC_OPCODE_ENDLOOP){ + endloops++; + continue; + } + if(opcode->HasDstReg){ + int src = 0; + unsigned int srcmasks[3]; + rc_compute_sources_for_writemask(ptr, + ptr->U.I.DstReg.WriteMask, srcmasks); + for(src=0; src < opcode->NumSrcRegs; src++){ + mark_used(&s, + ptr->U.I.SrcReg[src].File, + ptr->U.I.SrcReg[src].Index, + srcmasks[src]); + } + } + } + break; + } + case RC_OPCODE_ENDIF: + push_branch(&s); + break; + default: + if (opcode->IsFlowControl && s.BranchStackSize) { + struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1]; + if (opcode->Opcode == RC_OPCODE_IF) { + or_updatemasks(&s.R, + &s.R, + branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif); + + s.BranchStackSize--; + } else if (opcode->Opcode == RC_OPCODE_ELSE) { + if (branch->HaveElse) { + rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__); } else { - rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name); + memcpy(&branch->StoreElse, &s.R, sizeof(s.R)); + memcpy(&s.R, &branch->StoreEndif, sizeof(s.R)); + branch->HaveElse = 1; } } else { - rc_error(c, "%s: Unexpected control flow instruction\n", __FUNCTION__); + rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name); } } } diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c index 4c5d29f4217..1aaaa6cccd2 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c @@ -38,22 +38,6 @@ #define DBG(...) 
do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0) -struct emulate_loop_state { - struct radeon_compiler * C; - struct loop_info * Loops; - unsigned int LoopCount; - unsigned int LoopReserved; -}; - -struct loop_info { - struct rc_instruction * BeginLoop; - struct rc_instruction * Cond; - struct rc_instruction * If; - struct rc_instruction * Brk; - struct rc_instruction * EndIf; - struct rc_instruction * EndLoop; -}; - struct const_value { struct radeon_compiler * C; @@ -214,8 +198,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst, } static int transform_const_loop(struct emulate_loop_state * s, - struct loop_info * loop, - struct rc_instruction * cond) + struct loop_info * loop) { int end_loops = 1; int iterations; @@ -228,13 +211,13 @@ static int transform_const_loop(struct emulate_loop_state * s, /* Find the counter and the upper limit */ - if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){ - limit = &cond->U.I.SrcReg[0]; - counter = &cond->U.I.SrcReg[1]; + if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){ + limit = &loop->Cond->U.I.SrcReg[0]; + counter = &loop->Cond->U.I.SrcReg[1]; } - else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){ - limit = &cond->U.I.SrcReg[1]; - counter = &cond->U.I.SrcReg[0]; + else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){ + limit = &loop->Cond->U.I.SrcReg[1]; + counter = &loop->Cond->U.I.SrcReg[0]; } else{ DBG("No constant limit.\n"); @@ -414,7 +397,7 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s, } /* Check if the number of loops is known at compile time. 
*/ - if(transform_const_loop(s, loop, ptr)){ + if(transform_const_loop(s, loop)){ return loop->BeginLoop->Next; } @@ -425,9 +408,14 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s, return loop->EndLoop; } -static void rc_transform_loops(struct emulate_loop_state * s) +void rc_transform_unroll_loops(struct radeon_compiler *c, + struct emulate_loop_state * s) { - struct rc_instruction * ptr = s->C->Program.Instructions.Next; + struct rc_instruction * ptr; + + memset(s, 0, sizeof(struct emulate_loop_state)); + s->C = c; + ptr = s->C->Program.Instructions.Next; while(ptr != &s->C->Program.Instructions) { if(ptr->Type == RC_INSTRUCTION_NORMAL && ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){ @@ -440,7 +428,7 @@ static void rc_transform_loops(struct emulate_loop_state * s) } } -static void rc_unroll_loops(struct emulate_loop_state *s, +void rc_emulate_loops(struct emulate_loop_state *s, unsigned int max_instructions) { int i; @@ -456,19 +444,3 @@ static void rc_unroll_loops(struct emulate_loop_state *s, loop_unroll(s, &s->Loops[i], iterations); } } - -void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions) -{ - struct emulate_loop_state s; - - memset(&s, 0, sizeof(struct emulate_loop_state)); - s.C = c; - - /* We may need to move these two operations to r3xx_(vert|frag)prog.c - * and run the optimization passes between them in order to increase - * the number of unrolls we can do for each loop. 
- */ - rc_transform_loops(&s); - - rc_unroll_loops(&s, max_instructions); -} diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h index ddcf1c0fabe..7748813c4eb 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h @@ -7,6 +7,26 @@ struct radeon_compiler; -void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions); +struct loop_info { + struct rc_instruction * BeginLoop; + struct rc_instruction * Cond; + struct rc_instruction * If; + struct rc_instruction * Brk; + struct rc_instruction * EndIf; + struct rc_instruction * EndLoop; +}; + +struct emulate_loop_state { + struct radeon_compiler * C; + struct loop_info * Loops; + unsigned int LoopCount; + unsigned int LoopReserved; +}; + +void rc_transform_unroll_loops(struct radeon_compiler *c, + struct emulate_loop_state * s); + +void rc_emulate_loops(struct emulate_loop_state *s, + unsigned int max_instructions); #endif /* RADEON_EMULATE_LOOPS_H */ -- 2.30.2