r300/compiler: Implement hardware assisted loops for vertex shaders.
author Tom Stellard <tstellar@gmail.com>
Thu, 5 Aug 2010 17:19:00 +0000 (10:19 -0700)
committer Tom Stellard <tstellar@gmail.com>
Tue, 10 Aug 2010 20:17:25 +0000 (13:17 -0700)
Single loops work, but nested loops do not.

src/gallium/drivers/r300/r300_emit.c
src/gallium/drivers/r300/r300_reg.h
src/gallium/drivers/r300/r300_state.c
src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
src/mesa/drivers/dri/r300/compiler/radeon_code.h
src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
src/mesa/drivers/dri/r300/r300_reg.h

index 7bd43b6eb5d63f06f781c29fec710cc107afc8b9..98958d1a2e019044b84fdf93cb46a637e2c255fa 100644 (file)
@@ -936,6 +936,22 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
             OUT_CS_TABLE(data, 4);
         }
     }
+
+    /* Emit flow control instructions. */
+    if (code->num_fc_ops) {
+
+        OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
+        if (r300screen->caps.is_r500) {
+            OUT_CS_REG_SEQ(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0, code->num_fc_ops * 2);
+            OUT_CS_TABLE(code->fc_op_addrs.r500, code->num_fc_ops * 2);
+        } else {
+            OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_ADDRS_0, code->num_fc_ops);
+            OUT_CS_TABLE(code->fc_op_addrs.r300, code->num_fc_ops);
+        }
+        OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0, code->num_fc_ops);
+        OUT_CS_TABLE(code->fc_loop_index, code->num_fc_ops);
+    }
+
     END_CS;
 }
 
index 99a9d65055186ca3d6b2f35acaa370feb1f69663..60d3b600cb7fcdd1f4b7c658e93bf486d2595335 100644 (file)
@@ -496,6 +496,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
 #define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
 
+#define R300_VAP_PVS_FLOW_CNTL_ADDRS_0      0x2230
+#define R300_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x)            ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x)             ((x) << 24)
+
 /* gap */
 
 /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -514,6 +520,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_2288_R300                    0x00750000 /* -- nh */
 #       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
 
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x)        ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x)        ((x) << 8)
+
 /* gap */
 
 /* Addresses are relative to the vertex program instruction area of the
@@ -548,6 +558,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_PVS_CODE_CNTL_1           0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+/* Two-bit opcode for flow control op slot x.  The constants are unsigned so
+ * that the highest slot (x == 15) does not overflow a signed int. */
+#define R300_VAP_PVS_FC_OPC_JUMP(x)         (1u << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_LOOP(x)         (2u << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_JSR(x)          (3u << (2 * (x)))
 
 /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
  * immediate vertices
@@ -564,6 +577,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* write 0 to indicate end of packet? */
 #define R300_VAP_VTX_END_OF_PKT             0x24AC
 
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0   0x2500
+#define R500_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0   0x2504
+#define R500_PVS_FC_LAST_INST(x)            ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x)             ((x) << 16)
+
 /* gap */
 
 /* These are values from r300_reg/r300_reg.h - they are known to be correct
index 9db5e9e0545c27f8581b03dd698a91fb027734ce..e62a33daebd47a3188713928e952924837854c0b 100644 (file)
@@ -1758,10 +1758,12 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
     r300->rs_block_state.dirty = TRUE; /* Will be updated before the emission. */
 
     if (r300->screen->caps.has_tcl) {
+        unsigned fc_op_dwords = r300->screen->caps.is_r500 ? 3 : 2;
         r300->vs_state.dirty = TRUE;
         r300->vs_state.size =
                 vs->code.length + 9 +
-                (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0);
+                (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0) +
+        (vs->code.num_fc_ops ? vs->code.num_fc_ops * fc_op_dwords + 4 : 0);
 
         if (vs->externals_count) {
             r300->vs_constants.dirty = TRUE;
index c6246a81a24f8fdd162be8d4241894905780ef0d..d2fa816894ce109e157eb9d33bc571995bfc09c5 100644 (file)
@@ -113,7 +113,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
                debug_program_log(c, "after unroll loops");
        }
        else{
-               rc_transform_loops(&c->Base, &loop_state);
+               rc_transform_loops(&c->Base, &loop_state, -1);
                debug_program_log(c, "after transform loops");
 
                rc_emulate_branches(&c->Base);
index e940fedec20abdb89f91a71e1466e7f6027e2b99..7c2ba2fc092b18a2ad9d28fedfefccb67a101875 100644 (file)
 #include "radeon_emulate_branches.h"
 #include "radeon_emulate_loops.h"
 
+struct loop { /* Bookkeeping for one loop that is currently open during translation. */
+       int BgnLoop; /* Value of code->length / 4 recorded when the BGNLOOP was translated. */
+
+};
+
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
  * obtain a constant ZERO or ONE source.
@@ -337,6 +342,10 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 {
        struct rc_instruction *rci;
 
+       /* Stack of open loops.  Nothing is allocated until the first BGNLOOP,
+        * but the pointer is handed to memory_pool_array_reserve() and indexed
+        * by the ENDLOOP case, so it must never be left indeterminate. */
+       struct loop * loops = NULL;
+       int current_loop_depth = 0;
+       int loops_reserved = 0;
+
        compiler->code->pos_end = 0;    /* Not supported yet */
        compiler->code->length = 0;
 
@@ -385,6 +394,68 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
                case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
                case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
                case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
+               case RC_OPCODE_BGNLOOP:
+               {
+                       /* Open a new loop and remember where its body starts
+                        * so that the matching ENDLOOP can build the flow
+                        * control addresses. */
+                       struct loop * l;
+
+                       /* The hardware limit applies to the number of loops
+                        * that are open at the same time.  loops_reserved is
+                        * only the capacity of the loops array and must not
+                        * be used as the nesting depth. */
+                       if ((!compiler->Base.is_r500
+                               && current_loop_depth >= R300_VS_MAX_LOOP_DEPTH)
+                               || current_loop_depth >= R500_VS_MAX_FC_DEPTH) {
+                               rc_error(&compiler->Base,
+                                               "Loops are nested too deep.");
+                               return;
+                       }
+                       memory_pool_array_reserve(&compiler->Base.Pool,
+                                       struct loop, loops, current_loop_depth,
+                                       loops_reserved, 1);
+                       l = &loops[current_loop_depth++];
+                       memset(l , 0, sizeof(struct loop));
+                       /* Number of instructions emitted so far, i.e. the
+                        * index of the first instruction of the loop body. */
+                       l->BgnLoop = (compiler->code->length / 4);
+                       continue;
+               }
+               case RC_OPCODE_ENDLOOP:
+               {
+                       /* Close the innermost open loop and record one LOOP
+                        * flow control op that describes it. */
+                       struct loop * l;
+                       unsigned int act_addr;
+                       unsigned int last_addr;
+                       unsigned int ret_addr;
+
+                       /* Without an open loop there is no loops[] entry to
+                        * read; fail cleanly instead of indexing loops[-1]. */
+                       if (current_loop_depth <= 0) {
+                               rc_error(&compiler->Base,
+                                       "ENDLOOP without matching BGNLOOP.");
+                               return;
+                       }
+                       /* num_fc_ops indexes the fixed-size fc_op_addrs and
+                        * fc_loop_index arrays, so it is the value that has
+                        * to stay below R300_VS_MAX_FC_OPS. */
+                       if (compiler->code->num_fc_ops >= R300_VS_MAX_FC_OPS) {
+                               rc_error(&compiler->Base,
+                                       "Too many flow control instructions.");
+                               return;
+                       }
+
+                       l = &loops[current_loop_depth - 1];
+                       /* NOTE(review): act_addr wraps around when the loop
+                        * starts at instruction 0 - confirm the intended
+                        * encoding for that case. */
+                       act_addr = l->BgnLoop - 1;
+                       last_addr = (compiler->code->length / 4) - 1;
+                       ret_addr = l->BgnLoop;
+
+                       if (compiler->Base.is_r500) {
+                               /* r500 splits the addresses over two words. */
+                               compiler->code->fc_op_addrs.r500
+                                       [compiler->code->num_fc_ops].lw =
+                                       R500_PVS_FC_ACT_ADRS(act_addr)
+                                       | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
+                                       ;
+                               compiler->code->fc_op_addrs.r500
+                                       [compiler->code->num_fc_ops].uw =
+                                       R500_PVS_FC_LAST_INST(last_addr)
+                                       | R500_PVS_FC_RTN_INST(ret_addr)
+                                       ;
+                       } else {
+                               /* r300 packs all four fields into one word. */
+                               compiler->code->fc_op_addrs.r300
+                                       [compiler->code->num_fc_ops] =
+                                       R300_PVS_FC_ACT_ADRS(act_addr)
+                                       | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
+                                       | R300_PVS_FC_LAST_INST(last_addr)
+                                       | R300_PVS_FC_RTN_INST(ret_addr)
+                                       ;
+                       }
+                       compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
+                               R300_PVS_FC_LOOP_INIT_VAL(0x0)
+                               | R300_PVS_FC_LOOP_STEP_VAL(0x1)
+                               ;
+                       /* Mark this op slot as a LOOP in the packed opcodes. */
+                       compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
+                                               compiler->code->num_fc_ops);
+                       compiler->code->num_fc_ops++;
+                       current_loop_depth--;
+                       continue;
+               }
+
                default:
                        rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name);
                        return;
@@ -406,6 +477,7 @@ struct temporary_allocation {
 static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
 {
        struct rc_instruction *inst;
+       struct rc_instruction *end_loop = NULL;
        unsigned int num_orig_temps = 0;
        char hwtemps[R300_VS_MAX_TEMPS];
        struct temporary_allocation * ta;
@@ -440,10 +512,35 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
        /* Pass 2: Determine original temporary lifetimes */
        for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
                const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+               /* Instructions inside of loops need to use the ENDLOOP
+                * instruction as their LastRead. */
+               if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+                       int endloops = 1;
+                       struct rc_instruction * ptr;
+                       for(ptr = inst->Next;
+                               ptr != &compiler->Base.Program.Instructions;
+                                                       ptr = ptr->Next){
+                               if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+                                       endloops++;
+                               } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+                                       endloops--;
+                                       if (endloops <= 0) {
+                                               end_loop = ptr;
+                                               break;
+                                       }
+                               }
+                       }
+               }
+
+               if (inst == end_loop) {
+                       end_loop = NULL;
+                       continue;
+               }
 
                for (i = 0; i < opcode->NumSrcRegs; ++i) {
                        if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
-                               ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
+                               ta[inst->U.I.SrcReg[i].Index].LastRead =
+                                               end_loop ? end_loop : inst;
                }
        }
 
@@ -640,17 +737,11 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
        debug_program_log(compiler, "before compilation");
 
-       /* XXX Ideally this should be done only for r3xx, but since
-        * we don't have branching support for r5xx, we use the emulation
-        * on all chipsets. */
+       if (compiler->Base.is_r500)
+               rc_transform_loops(&compiler->Base, &loop_state, R500_VS_MAX_ALU);
+       else
+               rc_transform_loops(&compiler->Base, &loop_state, R300_VS_MAX_ALU);
 
-       if (compiler->Base.is_r500){
-               rc_transform_loops(&compiler->Base, &loop_state);
-               rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
-       } else {
-               rc_transform_loops(&compiler->Base, &loop_state);
-               rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
-       }
        debug_program_log(compiler, "after emulate loops");
 
        rc_emulate_branches(&compiler->Base);
@@ -717,6 +808,6 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
        if (compiler->Base.Debug) {
                fprintf(stderr, "Final vertex program code:\n");
-               r300_vertex_program_dump(compiler->code);
+               r300_vertex_program_dump(compiler);
        }
 }
index 5800f1a78e1412c28942ac24945d19a868849bb3..66e352d05dc86215b8855af29364c24034d1b8a3 100644 (file)
@@ -20,6 +20,7 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "radeon_compiler.h"
 #include "radeon_code.h"
 
 #include <stdio.h>
@@ -160,8 +161,9 @@ static void r300_vs_src_dump(uint32_t src)
                        r300_vs_swiz_debug[(src >> 22) & 0x7]);
 }
 
-void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
+void r300_vertex_program_dump(struct r300_vertex_program_compiler * c)
 {
+       struct r300_vertex_program_code * vs = c->code;
        unsigned instrcount = vs->length / 4;
        unsigned i;
 
@@ -177,4 +179,21 @@ void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
                        r300_vs_src_dump(vs->body.d[offset+1+src]);
                }
        }
+
+       fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops);
+       for(i = 0; i < vs->num_fc_ops; i++) {
+               switch((vs->fc_ops >> (i * 2)) & 0x3 ) {
+               case 0: fprintf(stderr, "NOP"); break;
+               case 1: fprintf(stderr, "JUMP"); break;
+               case 2: fprintf(stderr, "LOOP"); break;
+               case 3: fprintf(stderr, "JSR"); break;
+               }
+               if (c->Base.is_r500) {
+                       fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n",
+                               vs->fc_op_addrs.r500[i].uw,
+                               vs->fc_op_addrs.r500[i].lw);
+               } else {
+                       fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]);
+               }
+       }
 }
index e14a3520dd30e1ff9d421a432a54fe6c8f7c53fb..896246d203535b8584e36f819b6517661cbade57 100644 (file)
@@ -243,6 +243,12 @@ struct rX00_fragment_program_code {
 #define R500_VS_MAX_ALU                1024
 #define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
 #define R300_VS_MAX_TEMPS      32
+/* This is the max for all chipsets (r300-r500) */
+#define R300_VS_MAX_FC_OPS 16
+/* The r500 maximum depth is not just for loops, but any combination of loops
+ * and subroutine jumps. */
+#define R500_VS_MAX_FC_DEPTH 8
+#define R300_VS_MAX_LOOP_DEPTH 1
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -263,9 +269,18 @@ struct r300_vertex_program_code {
 
        uint32_t InputsRead;
        uint32_t OutputsWritten;
-};
 
-void r300_vertex_program_dump(struct r300_vertex_program_code * vs);
+       unsigned int num_fc_ops; /* Number of flow control ops recorded; index of the next free slot below. */
+       uint32_t fc_ops; /* Packed opcodes, two bits per op slot (R300_VAP_PVS_FC_OPC_*). */
+       union { /* Per-op address words; the translator fills r500 when is_r500 is set, r300 otherwise. */
+               uint32_t r300[R300_VS_MAX_FC_OPS]; /* One word per op built from the R300_PVS_FC_* fields. */
+               struct {
+                       uint32_t lw; /* R500_PVS_FC_ACT_ADRS | R500_PVS_FC_LOOP_CNT_JMP_INST */
+                       uint32_t uw; /* R500_PVS_FC_LAST_INST | R500_PVS_FC_RTN_INST */
+               } r500[R300_VS_MAX_FC_OPS];
+       } fc_op_addrs;
+       int32_t fc_loop_index[R300_VS_MAX_FC_OPS]; /* Per-op R300_PVS_FC_LOOP_INIT_VAL | R300_PVS_FC_LOOP_STEP_VAL. */
+};
 
 #endif /* RADEON_CODE_H */
 
index f15905d79d4c24e7921ac704190527f4b9bf911b..bbd57cca63f0f22fba2c20224989bbcc961cf10b 100644 (file)
@@ -113,5 +113,6 @@ struct r300_vertex_program_compiler {
 };
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c);
+void r300_vertex_program_dump(struct r300_vertex_program_compiler * c);
 
 #endif /* RADEON_COMPILER_H */
index 2a3306f9065e88dc713e0d5fa0758fdefbcc477f..32d4b45dd6d7dcadf2e8d23ab0606212e8071a04 100644 (file)
@@ -423,7 +423,8 @@ static int build_loop_info(struct radeon_compiler * c, struct loop_info * loop,
  * @param inst A pointer to a BGNLOOP instruction.
  * @return 1 for success, 0 for failure
  */
-int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
+static int transform_loop(struct emulate_loop_state * s,
+                                               struct rc_instruction * inst)
 {
        struct loop_info * loop;
 
@@ -435,7 +436,7 @@ int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
        if (!build_loop_info(s->C, loop, inst))
                return 0;
 
-       if(try_unroll_loop(s->C, loop, -1)){
+       if(try_unroll_loop(s->C, loop, s->prog_inst_limit)){
                return 1;
        }
 
@@ -472,12 +473,13 @@ int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
 }
 
 void rc_transform_loops(struct radeon_compiler *c,
-                                               struct emulate_loop_state * s)
+                       struct emulate_loop_state * s, int prog_inst_limit)
 {
        struct rc_instruction * ptr;
 
        memset(s, 0, sizeof(struct emulate_loop_state));
        s->C = c;
+       s->prog_inst_limit = prog_inst_limit;
        for(ptr = s->C->Program.Instructions.Next;
                        ptr != &s->C->Program.Instructions; ptr = ptr->Next) {
                if(ptr->Type == RC_INSTRUCTION_NORMAL &&
index 86d91ef14bd8546f888746c56dde96cea0b11407..bba1f68e3086d996c1c52dd3b0c87b88f7e527b5 100644 (file)
@@ -21,10 +21,11 @@ struct emulate_loop_state {
        struct loop_info * Loops;
        unsigned int LoopCount;
        unsigned int LoopReserved;
+       int prog_inst_limit;
 };
 
 void rc_transform_loops(struct radeon_compiler *c,
-                                               struct emulate_loop_state * s);
+                       struct emulate_loop_state * s, int prog_inst_limit);
 
 void rc_unroll_loops(struct radeon_compiler * c, int prog_inst_limit);
 
index f25264b6f2d7d5a2768ec0a72a1c971011b54c10..f7705b0f6fe2ffe94a9bc576f787ded207351570 100644 (file)
@@ -441,6 +441,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
 #define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
 
+#define R300_VAP_PVS_FLOW_CNTL_ADDRS_0      0x2230
+#define R300_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x)            ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x)             ((x) << 24)
+
 /* gap */
 
 /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -459,6 +465,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_2288_R300                    0x00750000 /* -- nh */
 #       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
 
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x)        ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x)        ((x) << 8)
+
 /* gap */
 
 /* Addresses are relative to the vertex program instruction area of the
@@ -489,6 +499,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_PVS_CODE_CNTL_1           0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+/* Two-bit opcode for flow control op slot x.  The constants are unsigned so
+ * that the highest slot (x == 15) does not overflow a signed int. */
+#define R300_VAP_PVS_FC_OPC_JUMP(x)         (1u << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_LOOP(x)         (2u << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_JSR(x)          (3u << (2 * (x)))
 
 /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
  * immediate vertices
@@ -505,6 +518,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* write 0 to indicate end of packet? */
 #define R300_VAP_VTX_END_OF_PKT             0x24AC
 
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0   0x2500
+#define R500_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0   0x2504
+#define R500_PVS_FC_LAST_INST(x)            ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x)             ((x) << 16)
+
 /* gap */
 
 /* These are values from r300_reg/r300_reg.h - they are known to be correct