From f381c52081b2cbff31c2f38abf16dffcc08f681c Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 18 Jun 2010 21:20:57 -0700 Subject: [PATCH] r300/compiler: Use hardware flow control instructions for loops on r500. --- src/gallium/drivers/r300/r300_fs.c | 3 +- .../drivers/dri/r300/compiler/r3xx_fragprog.c | 25 ++--- .../drivers/dri/r300/compiler/r500_fragprog.c | 54 ++++++++-- .../drivers/dri/r300/compiler/r500_fragprog.h | 4 + .../dri/r300/compiler/r500_fragprog_emit.c | 100 ++++++++++++++---- .../r300/compiler/radeon_dataflow_deadcode.c | 4 + .../dri/r300/compiler/radeon_opcodes.c | 6 ++ .../dri/r300/compiler/radeon_opcodes.h | 2 + 8 files changed, 154 insertions(+), 44 deletions(-) diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c index 424f831731d..b145ded6399 100644 --- a/src/gallium/drivers/r300/r300_fs.c +++ b/src/gallium/drivers/r300/r300_fs.c @@ -246,13 +246,14 @@ static void r300_emit_fs_code_to_buffer( if (r300->screen->caps.is_r500) { struct r500_fragment_program_code *code = &generic_code->code.r500; - shader->cb_code_size = 17 + + shader->cb_code_size = 19 + ((code->inst_end + 1) * 6) + imm_count * 7; NEW_CB(shader->cb_code, shader->cb_code_size); OUT_CB_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO); OUT_CB_REG(R500_US_PIXSIZE, code->max_temp_idx); + OUT_CB_REG(R500_US_FC_CTRL, code->us_fc_ctrl); OUT_CB_REG(R500_US_CODE_RANGE, R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end)); OUT_CB_REG(R500_US_CODE_OFFSET, 0); diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c index 147b0710dbc..b53571ab4e7 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c @@ -103,15 +103,14 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) debug_program_log(c, "before compilation"); - /* XXX Ideally this should be done only for r3xx, but since - * 
we don't have branching support for r5xx, we use the emulation - * on all chipsets. */ - - rc_transform_unroll_loops(&c->Base, &loop_state); - - debug_program_log(c, "after transform loops"); - - if (!c->Base.is_r500){ + if (c->Base.is_r500){ + r500_transform_unroll_loops(&c->Base, &loop_state); + debug_program_log(c, "after r500 transform loops"); + } + else{ + rc_transform_unroll_loops(&c->Base, &loop_state); + debug_program_log(c, "after transform loops"); + rc_emulate_branches(&c->Base); debug_program_log(c, "after emulate branches"); } @@ -161,14 +160,10 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c) debug_program_log(c, "after deadcode"); - if(c->Base.is_r500){ - rc_emulate_loops(&loop_state, R500_PFS_MAX_INST); - } - else{ + if(!c->Base.is_r500){ rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST); + debug_program_log(c, "after emulate loops"); } - - debug_program_log(c, "after emulate looops"); rc_optimize(&c->Base); diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c index 350ce3a25d8..e6b5522c5b9 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c @@ -30,6 +30,7 @@ #include #include "../r300_reg.h" +#include "radeon_emulate_loops.h" /** * Rewrite IF instructions to use the ALU result special register. @@ -59,6 +60,31 @@ int r500_transform_IF( return 1; } +/** + * Rewrite loops to make them easier to emit. This is not a local + * transformation, because it modifies and reorders an entire block of code. + */ +void r500_transform_unroll_loops(struct radeon_compiler * c, + struct emulate_loop_state *s) +{ + int i; + + rc_transform_unroll_loops(c, s); + + for( i = s->LoopCount - 1; i >= 0; i-- ){ + struct rc_instruction * inst_continue; + if(!s->Loops[i].EndLoop){ + continue; + } + /* Insert a continue instruction at the end of the loop. This + * is required in order to emit loops correctly. 
*/ + inst_continue = rc_insert_new_instruction(c, + s->Loops[i].EndIf->Prev); + inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE; + } + +} + static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) { unsigned int relevant; @@ -322,6 +348,11 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c) case R500_INST_TYPE_FC: fprintf(stderr, "\t2:FC_INST 0x%08x:", code->inst[n].inst2); inst = code->inst[n].inst2; + /* JUMP_FUNC JUMP_ANY*/ + fprintf(stderr, "0x%02x %1x ", inst >> 8 & 0xff, + (inst & R500_FC_JUMP_ANY) >> 5); + + /* OP */ switch(inst & 0x7){ case R500_FC_OP_JUMP: fprintf(stderr, "JUMP"); @@ -348,9 +379,8 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c) fprintf(stderr, "CONTINUE"); break; } - fprintf(stderr, " B_ELSE: %1x, JUMP_ANY: %1x", (inst & R500_FC_B_ELSE) >> 4, - (inst & R500_FC_JUMP_ANY) >> 5); - fprintf(stderr, ", A_OP: "); + fprintf(stderr," "); + /* A_OP */ switch(inst & (0x3 << 6)){ case R500_FC_A_OP_NONE: fprintf(stderr, "NONE"); @@ -362,11 +392,9 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c) fprintf(stderr, "PUSH"); break; } - fprintf(stderr, "\n\tJUMP_FUNC 0x%02x, B_POP_CNT: %d", - (inst >> 8) & 0xff, - (inst >> 16) & 0x1f); + /* B_OP0 B_OP1 */ for(i=0; i<2; i++){ - fprintf(stderr, ", B_OP%d: ", i); + fprintf(stderr, " "); switch(inst & (0x3 << (24 + (i * 2)))){ /* R500_FC_B_OP0_NONE * R500_FC_B_OP1_NONE */ @@ -383,9 +411,17 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c) break; } } - fprintf(stderr, ", IGN_UNC: %1x\n", inst & R500_FC_IGNORE_UNCOVERED); + /*POP_CNT B_ELSE */ + fprintf(stderr, " %d %1x", (inst >> 16) & 0x1f, (inst & R500_FC_B_ELSE) >> 4); + inst = code->inst[n].inst3; + /* JUMP_ADDR */ + fprintf(stderr, " %d", inst >> 16); + + if(code->inst[n].inst2 & R500_FC_IGNORE_UNCOVERED){ + fprintf(stderr, " IGN_UNC"); + } inst = code->inst[n].inst3; - fprintf(stderr, "\t3:FC_ADDR 0x%08x:", inst); + fprintf(stderr, "\n\t3:FC_ADDR 0x%08x:", 
inst); fprintf(stderr, "BOOL: 0x%02x, INT: 0x%02x, JUMP_ADDR: %d, JMP_GLBL: %1x\n", inst & 0x1f, (inst >> 8) & 0x1f, (inst >> 16) & 0x1ff, inst >> 31); break; diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h index 4efbae7ba67..0d005a794ff 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h @@ -36,6 +36,8 @@ #include "radeon_compiler.h" #include "radeon_swizzle.h" +struct emulate_loop_state; + extern void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler); extern void r500FragmentProgramDump(struct rX00_fragment_program_code *c); @@ -47,4 +49,6 @@ extern int r500_transform_IF( struct rc_instruction * inst, void* data); +void r500_transform_unroll_loops(struct radeon_compiler * c, + struct emulate_loop_state * s); #endif diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c index fb2d8b5a9c0..0bd8f0a239f 100644 --- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c @@ -45,6 +45,8 @@ #include "radeon_program_pair.h" +#define MAX_BRANCH_DEPTH_FULL 32 +#define MAX_BRANCH_DEPTH_PARTIAL 4 #define PROG_CODE \ struct r500_fragment_program_code *code = &c->code->code.r500 @@ -61,6 +63,10 @@ struct branch_info { int Endif; }; +struct loop_info { + int LoopStart; +}; + struct emit_state { struct radeon_compiler * C; struct r500_fragment_program_code * Code; @@ -69,7 +75,12 @@ struct emit_state { unsigned int CurrentBranchDepth; unsigned int BranchesReserved; + struct loop_info * Loops; + unsigned int CurrentLoopDepth; + unsigned int LoopsReserved; + unsigned int MaxBranchDepth; + }; static unsigned int translate_rgb_op(struct r300_fragment_program_compiler *c, rc_opcode opcode) @@ -359,16 +370,49 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * 
inst s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT; - if (inst->U.I.Opcode == RC_OPCODE_IF) { - if (s->CurrentBranchDepth >= 32) { + switch(inst->U.I.Opcode){ + struct branch_info * branch; + struct loop_info * loop; + case RC_OPCODE_BGNLOOP: + memory_pool_array_reserve(&s->C->Pool, struct loop_info, + s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1); + + loop = &s->Loops[s->CurrentLoopDepth++]; + + /* We don't emit an instruction for BGNLOOP, so we need to + * decrement the instruction counter, but first we need to + * set LoopStart to the current value of inst_end, which + * will end up being the first real instruction in the loop.*/ + loop->LoopStart = s->Code->inst_end--; + break; + + case RC_OPCODE_BRK: + /* Don't emit an instruction for BRK */ + s->Code->inst_end--; + break; + + case RC_OPCODE_CONTINUE: + loop = &s->Loops[s->CurrentLoopDepth - 1]; + s->Code->inst[newip].inst2 = R500_FC_OP_JUMP | + R500_FC_JUMP_FUNC(0xff); + s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart); + break; + + case RC_OPCODE_ENDLOOP: + /* Don't emit an instruction for ENDLOOP */ + s->Code->inst_end--; + s->CurrentLoopDepth--; + break; + + case RC_OPCODE_IF: + if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) { rc_error(s->C, "Branch depth exceeds hardware limit"); return; } - memory_pool_array_reserve(&s->C->Pool, struct branch_info, s->Branches, s->CurrentBranchDepth, s->BranchesReserved, 1); - struct branch_info * branch = &s->Branches[s->CurrentBranchDepth++]; + branch = &s->Branches[s->CurrentBranchDepth++]; branch->If = newip; branch->Else = -1; branch->Endif = -1; @@ -377,29 +421,50 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst s->MaxBranchDepth = s->CurrentBranchDepth; /* actual instruction is filled in at ENDIF time */ - } else if (inst->U.I.Opcode == RC_OPCODE_ELSE) { + break; + + case RC_OPCODE_ELSE: if (!s->CurrentBranchDepth) { rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__); 
return; } - struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1]; + branch = &s->Branches[s->CurrentBranchDepth - 1]; branch->Else = newip; /* actual instruction is filled in at ENDIF time */ - } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) { + break; + + case RC_OPCODE_ENDIF: if (!s->CurrentBranchDepth) { rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__); return; } - struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1]; - branch->Endif = newip; - + branch = &s->Branches[s->CurrentBranchDepth - 1]; + + if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){ + branch->Endif = --s->Code->inst_end; + s->Code->inst[branch->Endif].inst2 |= + R500_FC_B_OP0_DECR; + } + else{ + branch->Endif = newip; + + s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP + | R500_FC_A_OP_NONE /* no address stack */ + | R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */ + | R500_FC_B_OP0_DECR /* decrement branch counter if stay */ + | R500_FC_B_OP1_NONE /* no branch counter if stay */ + | R500_FC_B_POP_CNT(1) + ; + s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); + } s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP | R500_FC_A_OP_NONE /* no address stack */ | R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */ | R500_FC_B_OP0_INCR /* increment branch counter if stay */ + | R500_FC_IGNORE_UNCOVERED ; if (branch->Else >= 0) { @@ -421,17 +486,10 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); } - s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP - | R500_FC_A_OP_NONE /* no address stack */ - | R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */ - | R500_FC_B_OP0_DECR /* decrement branch counter if stay */ - | R500_FC_B_OP1_NONE /* no branch counter if stay */ - | R500_FC_B_POP_CNT(1) - ; - s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1); 
s->CurrentBranchDepth--; - } else { + break; + default: rc_error(s->C, "%s: unknown opcode %s\n", __FUNCTION__, rc_get_opcode_info(inst->U.I.Opcode)->Name); } } @@ -486,6 +544,10 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT; } + /* Use FULL flow control mode if branches are nested deep enough. + * We do not need to enable FULL flow control mode for loops, because + * we aren't using the hardware loop instructions. + */ if (s.MaxBranchDepth >= 4) { if (code->max_temp_idx < 1) code->max_temp_idx = 1; diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c index f8bced2532b..fbb4235c223 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c @@ -235,6 +235,10 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f } break; } + case RC_OPCODE_CONTINUE: + case RC_OPCODE_BRK: + case RC_OPCODE_BGNLOOP: + break; case RC_OPCODE_ENDIF: push_branch(&s); break; diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c index 1dc16855dc1..128745a5759 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c +++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c @@ -385,6 +385,12 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { .IsFlowControl = 1, .NumSrcRegs = 0, }, + { + .Opcode = RC_OPCODE_CONTINUE, + .Name = "CONTINUE", + .IsFlowControl = 1, + .NumSrcRegs = 0 + }, { .Opcode = RC_OPCODE_REPL_ALPHA, .Name = "REPL_ALPHA", diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h index 91c82ac0890..e103ce56371 100644 --- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h +++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h @@ -187,6
+187,8 @@ typedef enum { RC_OPCODE_ENDLOOP, + RC_OPCODE_CONTINUE, + /** special instruction, used in R300-R500 fragment program pair instructions * indicates that the result of the alpha operation shall be replicated * across all other channels */ -- 2.30.2