From: Tom Stellard Date: Wed, 21 Sep 2011 04:05:55 +0000 (-0700) Subject: r300/compiler: Fix nested flow control in r500 vertex shaders X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b2df031a959f36743527b9abc89913ce4f895de3;p=mesa.git r300/compiler: Fix nested flow control in r500 vertex shaders --- diff --git a/src/gallium/drivers/r300/Makefile.sources b/src/gallium/drivers/r300/Makefile.sources index e27b14e5702..1e7d31b210f 100644 --- a/src/gallium/drivers/r300/Makefile.sources +++ b/src/gallium/drivers/r300/Makefile.sources @@ -46,6 +46,7 @@ C_SOURCES := \ compiler/radeon_optimize.c \ compiler/radeon_remove_constants.c \ compiler/radeon_rename_regs.c \ + compiler/radeon_vert_fc.c \ compiler/radeon_variable.c \ compiler/r3xx_fragprog.c \ compiler/r300_fragprog.c \ diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c index a8d8ebc2dc8..94733d7367f 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c +++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c @@ -28,17 +28,13 @@ #include "radeon_compiler_util.h" #include "radeon_dataflow.h" +#include "radeon_program.h" #include "radeon_program_alu.h" #include "radeon_swizzle.h" #include "radeon_emulate_branches.h" #include "radeon_emulate_loops.h" #include "radeon_remove_constants.h" -struct loop { - int BgnLoop; - -}; - /* * Take an already-setup and valid source then swizzle it appropriately to * obtain a constant ZERO or ONE source. 
@@ -359,140 +355,13 @@ static void ei_pow(struct r300_vertex_program_code *vp, inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]); } -static void mark_write(void * userdata, struct rc_instruction * inst, - rc_register_file file, unsigned int index, unsigned int mask) -{ - unsigned int * writemasks = userdata; - - if (file != RC_FILE_TEMPORARY) - return; - - if (index >= R300_VS_MAX_TEMPS) - return; - - writemasks[index] |= mask; -} - -static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler) -{ - return PVS_SRC_OPERAND(compiler->PredicateIndex, - t_swizzle(RC_SWIZZLE_ZERO), - t_swizzle(RC_SWIZZLE_ZERO), - t_swizzle(RC_SWIZZLE_ZERO), - t_swizzle(RC_SWIZZLE_W), - t_src_class(RC_FILE_TEMPORARY), - 0); -} - -static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler, - unsigned int hw_opcode, int is_math) -{ - return PVS_OP_DST_OPERAND(hw_opcode, - is_math, - 0, - compiler->PredicateIndex, - RC_MASK_W, - t_dst_class(RC_FILE_TEMPORARY)); - -} - -static void ei_if(struct r300_vertex_program_compiler * compiler, - struct rc_instruction *rci, - unsigned int * inst, - unsigned int branch_depth) -{ - unsigned int predicate_opcode; - int is_math = 0; - - if (!compiler->Base.is_r500) { - rc_error(&compiler->Base,"Opcode IF not supported\n"); - return; - } - - /* Reserve a temporary to use as our predicate stack counter, if we - * don't already have one. */ - if (!compiler->PredicateMask) { - unsigned int writemasks[RC_REGISTER_MAX_INDEX]; - struct rc_instruction * inst; - unsigned int i; - memset(writemasks, 0, sizeof(writemasks)); - for(inst = compiler->Base.Program.Instructions.Next; - inst != &compiler->Base.Program.Instructions; - inst = inst->Next) { - rc_for_all_writes_mask(inst, mark_write, writemasks); - } - for(i = 0; i < compiler->Base.max_temp_regs; i++) { - unsigned int mask = ~writemasks[i] & RC_MASK_XYZW; - /* Only the W component can be used fo the predicate - * stack counter. 
*/ - if (mask & RC_MASK_W) { - compiler->PredicateMask = RC_MASK_W; - compiler->PredicateIndex = i; - break; - } - } - if (i == compiler->Base.max_temp_regs) { - rc_error(&compiler->Base, "No free temporary to use for" - " predicate stack counter.\n"); - return; - } - } - predicate_opcode = - branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ; - - rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0)); - if (branch_depth == 0) { - is_math = 1; - predicate_opcode = ME_PRED_SET_NEQ; - inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]); - inst[2] = 0; - } else { - predicate_opcode = VE_PRED_SET_NEQ_PUSH; - inst[1] = t_pred_src(compiler); - inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]); - } - - inst[0] = t_pred_dst(compiler, predicate_opcode, is_math); - inst[3] = 0; - -} - -static void ei_else(struct r300_vertex_program_compiler * compiler, - unsigned int * inst) -{ - if (!compiler->Base.is_r500) { - rc_error(&compiler->Base,"Opcode ELSE not supported\n"); - return; - } - inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1); - inst[1] = t_pred_src(compiler); - inst[2] = 0; - inst[3] = 0; -} - -static void ei_endif(struct r300_vertex_program_compiler *compiler, - unsigned int * inst) -{ - if (!compiler->Base.is_r500) { - rc_error(&compiler->Base,"Opcode ENDIF not supported\n"); - return; - } - inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1); - inst[1] = t_pred_src(compiler); - inst[2] = 0; - inst[3] = 0; -} - static void translate_vertex_program(struct radeon_compiler *c, void *user) { struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; struct rc_instruction *rci; - struct loop * loops = NULL; - int current_loop_depth = 0; - int loops_reserved = 0; - - unsigned int branch_depth = 0; + unsigned loops[R500_PVS_MAX_LOOP_DEPTH]; + unsigned loop_depth = 0; compiler->code->pos_end = 0; /* Not supported yet */ compiler->code->length = 0; @@ -532,12 +401,9 @@ static void 
translate_vertex_program(struct radeon_compiler *c, void *user) case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; - case RC_OPCODE_ELSE: ei_else(compiler, inst); break; - case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break; case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; - case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break; case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; @@ -556,37 +422,27 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break; case RC_OPCODE_BGNLOOP: { - struct loop * l; - if ((!compiler->Base.is_r500 - && loops_reserved >= R300_VS_MAX_LOOP_DEPTH) - || loops_reserved >= R500_VS_MAX_FC_DEPTH) { + && loop_depth >= R300_VS_MAX_LOOP_DEPTH) + || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) { rc_error(&compiler->Base, "Loops are nested too deep."); return; } - memory_pool_array_reserve(&compiler->Base.Pool, - struct loop, loops, current_loop_depth, - loops_reserved, 1); - l = &loops[current_loop_depth++]; - memset(l , 0, sizeof(struct loop)); - l->BgnLoop = (compiler->code->length / 4); - continue; + loops[loop_depth++] = ((compiler->code->length)/ 4) + 1; + break; } case RC_OPCODE_ENDLOOP: { - struct loop * l; unsigned int act_addr; unsigned int last_addr; unsigned int ret_addr; - assert(loops); - l = &loops[current_loop_depth - 1]; - act_addr = 
l->BgnLoop - 1; + ret_addr = loops[--loop_depth]; + act_addr = ret_addr - 1; last_addr = (compiler->code->length / 4) - 1; - ret_addr = l->BgnLoop; - if (loops_reserved >= R300_VS_MAX_FC_OPS) { + if (loop_depth >= R300_VS_MAX_FC_OPS) { rc_error(&compiler->Base, "Too many flow control instructions."); return; @@ -595,7 +451,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) compiler->code->fc_op_addrs.r500 [compiler->code->num_fc_ops].lw = R500_PVS_FC_ACT_ADRS(act_addr) - | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff) + | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff) ; compiler->code->fc_op_addrs.r500 [compiler->code->num_fc_ops].uw = @@ -618,26 +474,51 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP( compiler->code->num_fc_ops); compiler->code->num_fc_ops++; - current_loop_depth--; - continue; + + break; } + case RC_ME_PRED_SET_CLR: + ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst); + break; + + case RC_ME_PRED_SET_INV: + ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst); + break; + + case RC_ME_PRED_SET_POP: + ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst); + break; + + case RC_ME_PRED_SET_RESTORE: + ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst); + break; + + case RC_ME_PRED_SEQ: + ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst); + break; + + case RC_ME_PRED_SNEQ: + ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst); + break; + + case RC_VE_PRED_SNEQ_PUSH: + ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, + vpi, inst); + break; + default: rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name); return; } - /* Non-flow control instructions that are inside an if statement - * need to pay attention to the predicate bit. 
*/ - if (branch_depth - && vpi->Opcode != RC_OPCODE_IF - && vpi->Opcode != RC_OPCODE_ELSE - && vpi->Opcode != RC_OPCODE_ENDIF) { - + if (vpi->DstReg.Pred != RC_PRED_DISABLED) { inst[0] |= (PVS_DST_PRED_ENABLE_MASK << PVS_DST_PRED_ENABLE_SHIFT); - inst[0] |= (PVS_DST_PRED_SENSE_MASK + if (vpi->DstReg.Pred == RC_PRED_SET) { + inst[0] |= (PVS_DST_PRED_SENSE_MASK << PVS_DST_PRED_SENSE_SHIFT); + } } /* Update the number of temporaries. */ @@ -650,10 +531,6 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) vpi->SrcReg[i].Index >= compiler->code->num_temporaries) compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; - if (compiler->PredicateMask) - if (compiler->PredicateIndex >= compiler->code->num_temporaries) - compiler->code->num_temporaries = compiler->PredicateIndex + 1; - if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { rc_error(&compiler->Base, "Too many temporaries.\n"); return; @@ -1018,7 +895,6 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) struct radeon_compiler_pass vs_list[] = { /* NAME DUMP PREDICATE FUNCTION PARAM */ {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL}, - {"transform loops", 1, 1, rc_transform_loops, NULL}, {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL}, {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL}, {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500}, @@ -1030,6 +906,7 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts}, {"register allocation", 1, opt, allocate_temporary_registers, NULL}, {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table}, + {"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL}, {"final code validation", 0, 1, rc_validate_final_shader, NULL}, {"machine code generation", 0, 1, translate_vertex_program, NULL}, {"dump machine 
code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL}, diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c index 2bc0a87eed8..a41559c8fde 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c +++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c @@ -190,16 +190,25 @@ void r300_vertex_program_dump(struct radeon_compiler *compiler, void *user) fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops); for(i = 0; i < vs->num_fc_ops; i++) { + unsigned is_loop = 0; switch((vs->fc_ops >> (i * 2)) & 0x3 ) { case 0: fprintf(stderr, "NOP"); break; case 1: fprintf(stderr, "JUMP"); break; - case 2: fprintf(stderr, "LOOP"); break; + case 2: fprintf(stderr, "LOOP"); is_loop = 1; break; case 3: fprintf(stderr, "JSR"); break; } if (c->Base.is_r500) { - fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n", + fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x " + "loop data->0x%08x\n", vs->fc_op_addrs.r500[i].uw, - vs->fc_op_addrs.r500[i].lw); + vs->fc_op_addrs.r500[i].lw, + vs->fc_loop_index[i]); + if (is_loop) { + fprintf(stderr, "Before = %u First = %u Last = %u\n", + vs->fc_op_addrs.r500[i].lw & 0xffff, + (vs->fc_op_addrs.r500[i].uw >> 16) & 0xffff, + vs->fc_op_addrs.r500[i].uw & 0xffff); + } } else { fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]); } diff --git a/src/gallium/drivers/r300/compiler/radeon_code.h b/src/gallium/drivers/r300/compiler/radeon_code.h index 4280d664f0a..44d550068fd 100644 --- a/src/gallium/drivers/r300/compiler/radeon_code.h +++ b/src/gallium/drivers/r300/compiler/radeon_code.h @@ -40,6 +40,9 @@ #define R500_PFS_MAX_BRANCH_DEPTH_FULL 32 #define R500_PFS_MAX_BRANCH_DEPTH_PARTIAL 4 +/* The r500 maximum depth is not just for loops, but any combination of loops + * and subroutine jumps. 
*/ +#define R500_PVS_MAX_LOOP_DEPTH 8 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0) @@ -262,9 +265,6 @@ struct rX00_fragment_program_code { #define R300_VS_MAX_TEMPS 32 /* This is the max for all chipsets (r300-r500) */ #define R300_VS_MAX_FC_OPS 16 -/* The r500 maximum depth is not just for loops, but any combination of loops - * and subroutine jumps. */ -#define R500_VS_MAX_FC_DEPTH 8 #define R300_VS_MAX_LOOP_DEPTH 1 #define VSF_MAX_INPUTS 32 diff --git a/src/gallium/drivers/r300/compiler/radeon_compiler.h b/src/gallium/drivers/r300/compiler/radeon_compiler.h index e7ccbb732d1..d42cee9cce1 100644 --- a/src/gallium/drivers/r300/compiler/radeon_compiler.h +++ b/src/gallium/drivers/r300/compiler/radeon_compiler.h @@ -137,11 +137,10 @@ struct r300_vertex_program_compiler { void * UserData; void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c); - int PredicateIndex; - unsigned int PredicateMask; }; void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c); +void rc_vert_fc(struct radeon_compiler *compiler, void *user); void r300_vertex_program_dump(struct radeon_compiler *compiler, void *user); struct radeon_compiler_pass { diff --git a/src/gallium/drivers/r300/compiler/radeon_opcodes.c b/src/gallium/drivers/r300/compiler/radeon_opcodes.c index 3b49ad7114c..9bcb3c990ad 100644 --- a/src/gallium/drivers/r300/compiler/radeon_opcodes.c +++ b/src/gallium/drivers/r300/compiler/radeon_opcodes.c @@ -437,6 +437,78 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { { .Opcode = RC_OPCODE_KILP, .Name = "KILP", + }, + { + .Opcode = RC_ME_PRED_SEQ, + .Name = "ME_PRED_SEQ", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = RC_ME_PRED_SGT, + .Name = "ME_PRED_SGT", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = RC_ME_PRED_SGE, + .Name = "ME_PRED_SGE", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = RC_ME_PRED_SNEQ, + .Name = "ME_PRED_SNEQ", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = 
RC_ME_PRED_SET_CLR, + .Name = "ME_PRED_SET_CLEAR", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = RC_ME_PRED_SET_INV, + .Name = "ME_PRED_SET_INV", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = RC_ME_PRED_SET_POP, + .Name = "ME_PRED_SET_POP", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = RC_ME_PRED_SET_RESTORE, + .Name = "ME_PRED_SET_RESTORE", + .NumSrcRegs = 1, + .HasDstReg = 1 + }, + { + .Opcode = RC_VE_PRED_SEQ_PUSH, + .Name = "VE_PRED_SEQ_PUSH", + .NumSrcRegs = 2, + .HasDstReg = 1 + }, + { + .Opcode = RC_VE_PRED_SGT_PUSH, + .Name = "VE_PRED_SGT_PUSH", + .NumSrcRegs = 2, + .HasDstReg = 1 + }, + { + .Opcode = RC_VE_PRED_SGE_PUSH, + .Name = "VE_PRED_SGE_PUSH", + .NumSrcRegs = 2, + .HasDstReg = 1 + }, + { + .Opcode = RC_VE_PRED_SNEQ_PUSH, + .Name = "VE_PRED_SNEQ_PUSH", + .NumSrcRegs = 2, + .HasDstReg = 1 } }; diff --git a/src/gallium/drivers/r300/compiler/radeon_opcodes.h b/src/gallium/drivers/r300/compiler/radeon_opcodes.h index 0b881c2bfe2..9c4b456168a 100644 --- a/src/gallium/drivers/r300/compiler/radeon_opcodes.h +++ b/src/gallium/drivers/r300/compiler/radeon_opcodes.h @@ -217,6 +217,21 @@ typedef enum { /** Stop execution of the shader (GLSL discard) */ RC_OPCODE_KILP, + /* Vertex shader CF Instructions */ + RC_ME_PRED_SEQ, + RC_ME_PRED_SGT, + RC_ME_PRED_SGE, + RC_ME_PRED_SNEQ, + RC_ME_PRED_SET_CLR, + RC_ME_PRED_SET_INV, + RC_ME_PRED_SET_POP, + RC_ME_PRED_SET_RESTORE, + + RC_VE_PRED_SEQ_PUSH, + RC_VE_PRED_SGT_PUSH, + RC_VE_PRED_SGE_PUSH, + RC_VE_PRED_SNEQ_PUSH, + MAX_RC_OPCODE } rc_opcode; diff --git a/src/gallium/drivers/r300/compiler/radeon_program.h b/src/gallium/drivers/r300/compiler/radeon_program.h index e68be935de4..67be1b9f213 100644 --- a/src/gallium/drivers/r300/compiler/radeon_program.h +++ b/src/gallium/drivers/r300/compiler/radeon_program.h @@ -58,6 +58,7 @@ struct rc_dst_register { unsigned int File:3; unsigned int Index:RC_REGISTER_INDEX_BITS; unsigned int WriteMask:4; + unsigned int Pred:2; }; struct 
rc_presub_instruction { diff --git a/src/gallium/drivers/r300/compiler/radeon_program_constants.h b/src/gallium/drivers/r300/compiler/radeon_program_constants.h index c07c492b0c9..4dbf6497ed9 100644 --- a/src/gallium/drivers/r300/compiler/radeon_program_constants.h +++ b/src/gallium/drivers/r300/compiler/radeon_program_constants.h @@ -203,4 +203,10 @@ static inline int rc_presubtract_src_reg_count(rc_presubtract_op op){ #define RC_SOURCE_RGB 0x1 #define RC_SOURCE_ALPHA 0x2 +typedef enum { + RC_PRED_DISABLED, + RC_PRED_SET, + RC_PRED_INV +} rc_predicate_mode; + #endif /* RADEON_PROGRAM_CONSTANTS_H */ diff --git a/src/gallium/drivers/r300/compiler/radeon_program_print.c b/src/gallium/drivers/r300/compiler/radeon_program_print.c index e3d2104b250..29a349e5da2 100644 --- a/src/gallium/drivers/r300/compiler/radeon_program_print.c +++ b/src/gallium/drivers/r300/compiler/radeon_program_print.c @@ -329,6 +329,12 @@ static void rc_print_normal_instruction(FILE * f, struct rc_instruction * inst, fprintf(f, ")]"); } + if (inst->U.I.DstReg.Pred == RC_PRED_SET) { + fprintf(f, " PRED_SET"); + } else if (inst->U.I.DstReg.Pred == RC_PRED_INV) { + fprintf(f, " PRED_INV"); + } + fprintf(f, "\n"); } diff --git a/src/gallium/drivers/r300/compiler/radeon_vert_fc.c b/src/gallium/drivers/r300/compiler/radeon_vert_fc.c new file mode 100644 index 00000000000..3568b238299 --- /dev/null +++ b/src/gallium/drivers/r300/compiler/radeon_vert_fc.c @@ -0,0 +1,274 @@ + +#include "radeon_compiler.h" +#include "radeon_compiler_util.h" +#include "radeon_dataflow.h" +#include "radeon_program.h" +#include "radeon_program_constants.h" + +struct vert_fc_state { + struct radeon_compiler *C; + unsigned BranchDepth; + unsigned LoopDepth; + unsigned LoopsReserved; + int PredStack[R500_PVS_MAX_LOOP_DEPTH]; + int PredicateReg; + unsigned InCFBreak; +}; + +static void build_pred_src( + struct rc_src_register * src, + struct vert_fc_state * fc_state) +{ + src->Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_UNUSED, 
RC_SWIZZLE_UNUSED, + RC_SWIZZLE_UNUSED, RC_SWIZZLE_W); + src->File = RC_FILE_TEMPORARY; + src->Index = fc_state->PredicateReg; +} + +static void build_pred_dst( + struct rc_dst_register * dst, + struct vert_fc_state * fc_state) +{ + dst->WriteMask = RC_MASK_W; + dst->File = RC_FILE_TEMPORARY; + dst->Index = fc_state->PredicateReg; +} + +static void mark_write(void * userdata, struct rc_instruction * inst, + rc_register_file file, unsigned int index, unsigned int mask) +{ + unsigned int * writemasks = userdata; + + if (file != RC_FILE_TEMPORARY) + return; + + if (index >= R300_VS_MAX_TEMPS) + return; + + writemasks[index] |= mask; +} + +static int reserve_predicate_reg(struct vert_fc_state * fc_state) +{ + int i; + unsigned int writemasks[RC_REGISTER_MAX_INDEX]; + struct rc_instruction * inst; + memset(writemasks, 0, sizeof(writemasks)); + for(inst = fc_state->C->Program.Instructions.Next; + inst != &fc_state->C->Program.Instructions; + inst = inst->Next) { + rc_for_all_writes_mask(inst, mark_write, writemasks); + } + + for(i = 0; i < fc_state->C->max_temp_regs; i++) { + /* Most of the control flow instructions only write the + * W component of the Predicate Register, but + * the docs say that ME_PRED_SET_CLR and + * ME_PRED_SET_RESTORE write all components of the + * register, so we must reserve a register that has + * all its components free. 
*/ + if (!writemasks[i]) { + fc_state->PredicateReg = i; + break; + } + } + if (i == fc_state->C->max_temp_regs) { + rc_error(fc_state->C, "No free temporary to use for" + " predicate stack counter.\n"); + return -1; + } + return 1; +} + +static void lower_bgnloop( + struct rc_instruction * inst, + struct vert_fc_state * fc_state) +{ + struct rc_instruction * new_inst = + rc_insert_new_instruction(fc_state->C, inst->Prev); + + if ((!fc_state->C->is_r500 + && fc_state->LoopsReserved >= R300_VS_MAX_LOOP_DEPTH) + || fc_state->LoopsReserved >= R500_PVS_MAX_LOOP_DEPTH) { + rc_error(fc_state->C, "Loops are nested too deep."); + return; + } + + if (fc_state->LoopDepth == 0 && fc_state->BranchDepth == 0) { + if (fc_state->PredicateReg == -1) { + if (reserve_predicate_reg(fc_state) == -1) { + return; + } + } + + /* Initialize the predicate bit to true. */ + new_inst->U.I.Opcode = RC_ME_PRED_SEQ; + build_pred_dst(&new_inst->U.I.DstReg, fc_state); + new_inst->U.I.SrcReg[0].Index = 0; + new_inst->U.I.SrcReg[0].File = RC_FILE_NONE; + new_inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000; + } else { + fc_state->PredStack[fc_state->LoopDepth] = + fc_state->PredicateReg; + /* Copy the current predicate value to this loop's + * predicate register */ + + /* Use the old predicate value for src0 */ + build_pred_src(&new_inst->U.I.SrcReg[0], fc_state); + + /* Reserve this loop's predicate register */ + if (reserve_predicate_reg(fc_state) == -1) { + return; + } + + /* Copy the old predicate value to the new register */ + new_inst->U.I.Opcode = RC_OPCODE_ADD; + build_pred_dst(&new_inst->U.I.DstReg, fc_state); + new_inst->U.I.SrcReg[1].Index = 0; + new_inst->U.I.SrcReg[1].File = RC_FILE_NONE; + new_inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_0000; + } + +} + +static void lower_brk( + struct rc_instruction * inst, + struct vert_fc_state * fc_state) +{ + if (fc_state->LoopDepth == 1) { + inst->U.I.Opcode = RC_OPCODE_RCP; + inst->U.I.DstReg.Pred = RC_PRED_INV; + inst->U.I.SrcReg[0].Index = 0; + 
inst->U.I.SrcReg[0].File = RC_FILE_NONE; + inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000; + } else { + inst->U.I.Opcode = RC_ME_PRED_SET_CLR; + inst->U.I.DstReg.Pred = RC_PRED_SET; + } + + build_pred_dst(&inst->U.I.DstReg, fc_state); +} + +static void lower_endloop( + struct rc_instruction * inst, + struct vert_fc_state * fc_state) +{ + struct rc_instruction * new_inst = + rc_insert_new_instruction(fc_state->C, inst); + + new_inst->U.I.Opcode = RC_ME_PRED_SET_RESTORE; + build_pred_dst(&new_inst->U.I.DstReg, fc_state); + /* Restore the previous predicate register. */ + fc_state->PredicateReg = fc_state->PredStack[fc_state->LoopDepth - 1]; + build_pred_src(&new_inst->U.I.SrcReg[0], fc_state); +} + +static void lower_if( + struct rc_instruction * inst, + struct vert_fc_state * fc_state) +{ + /* Reserve a temporary to use as our predicate stack counter, if we + * don't already have one. */ + if (fc_state->PredicateReg == -1) { + /* If we are inside a loop, the Predicate Register should + * have already been defined. 
*/ + assert(fc_state->LoopDepth == 0); + + if (reserve_predicate_reg(fc_state) == -1) { + return; + } + } + + if (inst->Next->U.I.Opcode == RC_OPCODE_BRK) { + fc_state->InCFBreak = 1; + } + if ((fc_state->BranchDepth == 0 && fc_state->LoopDepth == 0) + || (fc_state->LoopDepth == 1 && fc_state->InCFBreak)) { + if (fc_state->InCFBreak) { + inst->U.I.Opcode = RC_ME_PRED_SEQ; + inst->U.I.DstReg.Pred = RC_PRED_SET; + } else { + inst->U.I.Opcode = RC_ME_PRED_SNEQ; + } + } else { + unsigned swz; + inst->U.I.Opcode = RC_VE_PRED_SNEQ_PUSH; + memcpy(&inst->U.I.SrcReg[1], &inst->U.I.SrcReg[0], + sizeof(inst->U.I.SrcReg[1])); + swz = rc_get_scalar_src_swz(inst->U.I.SrcReg[1].Swizzle); + /* VE_PRED_SNEQ_PUSH needs the branch condition to be in the + * w component */ + inst->U.I.SrcReg[1].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_UNUSED, + RC_SWIZZLE_UNUSED, RC_SWIZZLE_UNUSED, swz); + build_pred_src(&inst->U.I.SrcReg[0], fc_state); + } + build_pred_dst(&inst->U.I.DstReg, fc_state); +} + +void rc_vert_fc(struct radeon_compiler *c, void *user) +{ + struct rc_instruction * inst; + struct vert_fc_state fc_state; + + memset(&fc_state, 0, sizeof(fc_state)); + fc_state.PredicateReg = -1; + fc_state.C = c; + + for(inst = c->Program.Instructions.Next; + inst != &c->Program.Instructions; + inst = inst->Next) { + + switch (inst->U.I.Opcode) { + + case RC_OPCODE_BGNLOOP: + lower_bgnloop(inst, &fc_state); + fc_state.LoopDepth++; + break; + + case RC_OPCODE_BRK: + lower_brk(inst, &fc_state); + break; + + case RC_OPCODE_ENDLOOP: + if (fc_state.BranchDepth != 0 + || fc_state.LoopDepth != 1) { + lower_endloop(inst, &fc_state); + } + fc_state.LoopDepth--; + /* Skip PRED_RESTORE */ + inst = inst->Next; + break; + case RC_OPCODE_IF: + lower_if(inst, &fc_state); + fc_state.BranchDepth++; + break; + + case RC_OPCODE_ELSE: + inst->U.I.Opcode = RC_ME_PRED_SET_INV; + build_pred_dst(&inst->U.I.DstReg, &fc_state); + build_pred_src(&inst->U.I.SrcReg[0], &fc_state); + break; + + case RC_OPCODE_ENDIF: + if 
(fc_state.LoopDepth == 1 && fc_state.InCFBreak) { + struct rc_instruction * to_delete = inst; + inst = inst->Prev; + rc_remove_instruction(to_delete); + /* XXX: Delete the endif instruction */ + } else { + inst->U.I.Opcode = RC_ME_PRED_SET_POP; + build_pred_dst(&inst->U.I.DstReg, &fc_state); + build_pred_src(&inst->U.I.SrcReg[0], &fc_state); + } + fc_state.InCFBreak = 0; + fc_state.BranchDepth--; + break; + + default: + if (fc_state.BranchDepth || fc_state.LoopDepth) { + inst->U.I.DstReg.Pred = RC_PRED_SET; + } + break; + } + } +}