r300/compiler: Handle loops in deadcode analysis.
author Tom Stellard <tstellar@gmail.com>
Sat, 12 Jun 2010 06:09:36 +0000 (23:09 -0700)
committer Marek Olšák <maraeo@gmail.com>
Sat, 3 Jul 2010 02:27:09 +0000 (04:27 +0200)
This also allows us to split the loop emulation into two phases.  A
transformation phase which either unrolls loops or prepares them to be
emulated, and the emulation phase which unrolls remaining loops until the
instruction limit is reached.  The second phase is completed after the
deadcode analysis in order to get a more accurate count of the number of
instructions in the body of loops.

src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h

index bbdfa0d56f997b6f37344c95c1a08a7d6a6988a5..31f556a96afadb16e56e8b48067f968c28df80ea 100644 (file)
@@ -97,6 +97,8 @@ static void debug_program_log(struct r300_fragment_program_compiler* c, const ch
 
 void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 {
+       struct emulate_loop_state loop_state;
+
        rewrite_depth_out(c);
 
        debug_program_log(c, "before compilation");
@@ -104,14 +106,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
        /* XXX Ideally this should be done only for r3xx, but since
         * we don't have branching support for r5xx, we use the emulation
         * on all chipsets. */
-       
-       if (c->Base.is_r500) {
-               rc_emulate_loops(&c->Base, R500_PFS_MAX_INST);
-       } else {
-               rc_emulate_loops(&c->Base, R300_PFS_MAX_ALU_INST);
-       }
-       debug_program_log(c, "after emulate loops");
 
+       rc_transform_unroll_loops(&c->Base, &loop_state);
+       
+       debug_program_log(c, "after transform loops");
+       
        rc_emulate_branches(&c->Base);
 
        debug_program_log(c, "after emulate branches");
@@ -161,6 +160,15 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 
        debug_program_log(c, "after deadcode");
 
+       if(c->Base.is_r500){
+               rc_emulate_loops(&loop_state, R500_PFS_MAX_INST);
+       }
+       else{
+               rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST);
+       }
+       
+       debug_program_log(c, "after emulate looops");
+
        rc_optimize(&c->Base);
 
        debug_program_log(c, "after dataflow optimize");
index e984797e2d35924a0721d998f8390caf8b7685c0..bd8d63246a20fdc56c14639960a14a8fd95df6dc 100644 (file)
@@ -593,6 +593,8 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
+       struct emulate_loop_state loop_state;
+       
        compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
 
        addArtificialOutputs(compiler);
@@ -602,10 +604,14 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
        /* XXX Ideally this should be done only for r3xx, but since
         * we don't have branching support for r5xx, we use the emulation
         * on all chipsets. */
+       rc_transform_unroll_loops(&compiler->Base, &loop_state);
+       
+       debug_program_log(compiler, "after transform loops");
+       
        if (compiler->Base.is_r500){
-               rc_emulate_loops(&compiler->Base, R500_VS_MAX_ALU);
+               rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
        } else {
-               rc_emulate_loops(&compiler->Base, R300_VS_MAX_ALU);
+               rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
        }
        debug_program_log(compiler, "after emulate loops");
 
index e3c2c83c0cfba29da029b589f4a0f659f84d2799..f8bced2532bc9e01d525ca717b2d22f128b06135 100644 (file)
@@ -202,32 +202,61 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
            inst = inst->Prev) {
                const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-               if (opcode->IsFlowControl) {
-                       if (opcode->Opcode == RC_OPCODE_ENDIF) {
-                               push_branch(&s);
-                       } else {
-                               if (s.BranchStackSize) {
-                                       struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
-
-                                       if (opcode->Opcode == RC_OPCODE_IF) {
-                                               or_updatemasks(&s.R,
-                                                               &s.R,
-                                                               branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
-
-                                               s.BranchStackSize--;
-                                       } else if (opcode->Opcode == RC_OPCODE_ELSE) {
-                                               if (branch->HaveElse) {
-                                                       rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
-                                               } else {
-                                                       memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
-                                                       memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
-                                                       branch->HaveElse = 1;
-                                               }
+               switch(opcode->Opcode){
+               /* Mark all sources in the loop body as used before doing
+                * normal deadcode analysis.  This is probably not optimal.
+                */
+               case RC_OPCODE_ENDLOOP:
+               {
+                       int endloops = 1;
+                       struct rc_instruction *ptr;
+                       for(ptr = inst->Prev; endloops > 0; ptr = ptr->Prev){
+                               opcode = rc_get_opcode_info(ptr->U.I.Opcode);
+                               if(ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
+                                       endloops--;
+                                       continue;
+                               }
+                               if(ptr->U.I.Opcode == RC_OPCODE_ENDLOOP){
+                                       endloops++;
+                                       continue;
+                               }
+                               if(opcode->HasDstReg){
+                                       int src = 0;
+                                       unsigned int srcmasks[3];
+                                       rc_compute_sources_for_writemask(ptr,
+                                               ptr->U.I.DstReg.WriteMask, srcmasks);
+                                       for(src=0; src < opcode->NumSrcRegs; src++){
+                                               mark_used(&s,
+                                                       ptr->U.I.SrcReg[src].File,
+                                                       ptr->U.I.SrcReg[src].Index,
+                                                       srcmasks[src]);
+                                       }
+                               }
+                       }
+                       break;
+               }
+               case RC_OPCODE_ENDIF:
+                       push_branch(&s);
+                       break;
+               default:
+                       if (opcode->IsFlowControl && s.BranchStackSize) {
+                               struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
+                               if (opcode->Opcode == RC_OPCODE_IF) {
+                                       or_updatemasks(&s.R,
+                                                       &s.R,
+                                                       branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
+
+                                       s.BranchStackSize--;
+                               } else if (opcode->Opcode == RC_OPCODE_ELSE) {
+                                       if (branch->HaveElse) {
+                                               rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
                                        } else {
-                                               rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
+                                               memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
+                                               memcpy(&s.R, &branch->StoreEndif, sizeof(s.R));
+                                               branch->HaveElse = 1;
                                        }
                                } else {
-                                       rc_error(c, "%s: Unexpected control flow instruction\n", __FUNCTION__);
+                                       rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
                                }
                        }
                }
index 4c5d29f42170ed3cdf5217f72112fa1d937dbbc8..1aaaa6cccd2bc6321e4737d145be7efe6be6df68 100644 (file)
 
 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
 
-struct emulate_loop_state {
-       struct radeon_compiler * C;
-       struct loop_info * Loops;
-       unsigned int LoopCount;
-       unsigned int LoopReserved;
-};
-
-struct loop_info {
-       struct rc_instruction * BeginLoop;
-       struct rc_instruction * Cond;
-       struct rc_instruction * If;
-       struct rc_instruction * Brk;
-       struct rc_instruction * EndIf;
-       struct rc_instruction * EndLoop;
-};
-
 struct const_value {
        
        struct radeon_compiler * C;
@@ -214,8 +198,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
 }
 
 static int transform_const_loop(struct emulate_loop_state * s,
-                                               struct loop_info * loop,
-                                               struct rc_instruction * cond)
+                                               struct loop_info * loop)
 {
        int end_loops = 1;
        int iterations;
@@ -228,13 +211,13 @@ static int transform_const_loop(struct emulate_loop_state * s,
 
        /* Find the counter and the upper limit */
        
-       if(src_reg_is_immediate(&cond->U.I.SrcReg[0], s->C)){
-               limit = &cond->U.I.SrcReg[0];
-               counter = &cond->U.I.SrcReg[1];
+       if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){
+               limit = &loop->Cond->U.I.SrcReg[0];
+               counter = &loop->Cond->U.I.SrcReg[1];
        }
-       else if(src_reg_is_immediate(&cond->U.I.SrcReg[1], s->C)){
-               limit = &cond->U.I.SrcReg[1];
-               counter = &cond->U.I.SrcReg[0];
+       else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){
+               limit = &loop->Cond->U.I.SrcReg[1];
+               counter = &loop->Cond->U.I.SrcReg[0];
        }
        else{
                DBG("No constant limit.\n");
@@ -414,7 +397,7 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
        }
        
        /* Check if the number of loops is known at compile time. */
-       if(transform_const_loop(s, loop, ptr)){
+       if(transform_const_loop(s, loop)){
                return loop->BeginLoop->Next;
        }
 
@@ -425,9 +408,14 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
        return loop->EndLoop;
 }
 
-static void rc_transform_loops(struct emulate_loop_state * s)
+void rc_transform_unroll_loops(struct radeon_compiler *c,
+                                       struct emulate_loop_state * s)
 {
-       struct rc_instruction * ptr = s->C->Program.Instructions.Next;
+       struct rc_instruction * ptr;
+       
+       memset(s, 0, sizeof(struct emulate_loop_state));
+       s->C = c;
+       ptr = s->C->Program.Instructions.Next;
        while(ptr != &s->C->Program.Instructions) {
                if(ptr->Type == RC_INSTRUCTION_NORMAL &&
                                        ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
@@ -440,7 +428,7 @@ static void rc_transform_loops(struct emulate_loop_state * s)
        }
 }
 
-static void rc_unroll_loops(struct emulate_loop_state *s,
+void rc_emulate_loops(struct emulate_loop_state *s,
                                                unsigned int max_instructions)
 {
        int i;
@@ -456,19 +444,3 @@ static void rc_unroll_loops(struct emulate_loop_state *s,
                loop_unroll(s, &s->Loops[i], iterations);
        }
 }
-
-void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions)
-{
-       struct emulate_loop_state s;
-
-       memset(&s, 0, sizeof(struct emulate_loop_state));
-       s.C = c;
-
-       /* We may need to move these two operations to r3xx_(vert|frag)prog.c
-        * and run the optimization passes between them in order to increase
-        * the number of unrolls we can do for each loop.
-        */
-       rc_transform_loops(&s);
-       
-       rc_unroll_loops(&s, max_instructions);
-}
index ddcf1c0fabee71e5ec2e93799b04865d636ec1ce..7748813c4eb33b0e0309273f87f32b65860c0344 100644 (file)
@@ -7,6 +7,26 @@
 
 struct radeon_compiler;
 
-void rc_emulate_loops(struct radeon_compiler *c, unsigned int max_instructions);
+struct loop_info {
+       struct rc_instruction * BeginLoop;
+       struct rc_instruction * Cond;
+       struct rc_instruction * If;
+       struct rc_instruction * Brk;
+       struct rc_instruction * EndIf;
+       struct rc_instruction * EndLoop;
+};
+
+struct emulate_loop_state {
+       struct radeon_compiler * C;
+       struct loop_info * Loops;
+       unsigned int LoopCount;
+       unsigned int LoopReserved;
+};
+
+void rc_transform_unroll_loops(struct radeon_compiler *c,
+                                       struct emulate_loop_state * s);
+
+void rc_emulate_loops(struct emulate_loop_state *s,
+                                       unsigned int max_instructions);
 
 #endif /* RADEON_EMULATE_LOOPS_H */