X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fr300%2Fcompiler%2Fr3xx_vertprog.c;h=d347b4df9cd6c17aed1a175c140fdd5b31928f74;hb=9fa64ea67544d252c6713e21d3b9c17a2bbb0603;hp=14dd36354d6ee22f63d5c9f34ff120be51f50f12;hpb=d6a304800b2385740f3b90efab45564e1e6203b2;p=mesa.git diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c index 14dd36354d6..d347b4df9cd 100644 --- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c +++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c @@ -22,14 +22,15 @@ #include "radeon_compiler.h" +#include + #include "../r300_reg.h" -#include "radeon_nqssadce.h" -#include "radeon_program.h" +#include "radeon_dataflow.h" #include "radeon_program_alu.h" - -#include "shader/prog_print.h" - +#include "radeon_swizzle.h" +#include "radeon_emulate_branches.h" +#include "radeon_emulate_loops.h" /* * Take an already-setup and valid source then swizzle it appropriately to @@ -42,103 +43,83 @@ t_swizzle(y), \ t_swizzle(y), \ t_src_class(vpi->SrcReg[x].File), \ - NEGATE_NONE) | (vpi->SrcReg[x].RelAddr << 4)) + RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4)) -static unsigned long t_dst_mask(GLuint mask) +static unsigned long t_dst_mask(unsigned int mask) { - /* WRITEMASK_* is equivalent to VSF_FLAG_* */ - return mask & WRITEMASK_XYZW; + /* RC_MASK_* is equivalent to VSF_FLAG_* */ + return mask & RC_MASK_XYZW; } -static unsigned long t_dst_class(gl_register_file file) +static unsigned long t_dst_class(rc_register_file file) { - switch (file) { - case PROGRAM_TEMPORARY: + default: + fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); + /* fall-through */ + case RC_FILE_TEMPORARY: return PVS_DST_REG_TEMPORARY; - case PROGRAM_OUTPUT: + case RC_FILE_OUTPUT: return PVS_DST_REG_OUT; - case PROGRAM_ADDRESS: + case RC_FILE_ADDRESS: return PVS_DST_REG_A0; - /* - case PROGRAM_INPUT: - case PROGRAM_LOCAL_PARAM: - case PROGRAM_ENV_PARAM: - case PROGRAM_NAMED_PARAM: - case PROGRAM_STATE_VAR: - case PROGRAM_WRITE_ONLY: - case PROGRAM_ADDRESS: - */ - default: - fprintf(stderr, "problem in %s", __FUNCTION__); - _mesa_exit(-1); - return -1; } } static unsigned long t_dst_index(struct r300_vertex_program_code *vp, - struct prog_dst_register *dst) + struct rc_dst_register *dst) { - if (dst->File == PROGRAM_OUTPUT) + if (dst->File == RC_FILE_OUTPUT) return vp->outputs[dst->Index]; return dst->Index; } -static unsigned long t_src_class(gl_register_file file) +static unsigned long t_src_class(rc_register_file file) { switch (file) { - case PROGRAM_TEMPORARY: + default: + fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); + /* fall-through */ + case RC_FILE_NONE: + case RC_FILE_TEMPORARY: return PVS_SRC_REG_TEMPORARY; - case PROGRAM_INPUT: + case RC_FILE_INPUT: return PVS_SRC_REG_INPUT; - case PROGRAM_LOCAL_PARAM: - case PROGRAM_ENV_PARAM: - case PROGRAM_NAMED_PARAM: - case PROGRAM_CONSTANT: - case PROGRAM_STATE_VAR: + case RC_FILE_CONSTANT: return PVS_SRC_REG_CONSTANT; - /* - case PROGRAM_OUTPUT: - case PROGRAM_WRITE_ONLY: - case PROGRAM_ADDRESS: - */ - default: - fprintf(stderr, "problem in %s", __FUNCTION__); - _mesa_exit(-1); - return -1; } } -static GLboolean t_src_conflict(struct prog_src_register a, struct prog_src_register b) +static int t_src_conflict(struct rc_src_register a, struct rc_src_register b) { unsigned long aclass = t_src_class(a.File); unsigned long bclass = t_src_class(b.File); if (aclass != bclass) - return GL_FALSE; + return 0; if (aclass == PVS_SRC_REG_TEMPORARY) - return GL_FALSE; + return 0; if (a.RelAddr || b.RelAddr) - return GL_TRUE; + return 1; if (a.Index != b.Index) - return GL_TRUE; + return 1; - return GL_FALSE; + return 0; } -static INLINE unsigned long t_swizzle(GLubyte swizzle) +static inline unsigned long t_swizzle(unsigned int swizzle) { - /* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */ + /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */ return swizzle; } static unsigned long t_src_index(struct r300_vertex_program_code *vp, - struct prog_src_register *src) + struct rc_src_register *src) { - if (src->File == PROGRAM_INPUT) { + if (src->File == RC_FILE_INPUT) { assert(vp->inputs[src->Index] != -1); return vp->inputs[src->Index]; } else { @@ -154,9 +135,9 @@ static unsigned long t_src_index(struct r300_vertex_program_code *vp, /* these two functions should probably be merged... */ static unsigned long t_src(struct r300_vertex_program_code *vp, - struct prog_src_register *src) + struct rc_src_register *src) { - /* src->Negate uses the NEGATE_ flags from program_instruction.h, + /* src->Negate uses the RC_MASK_ flags from program_instruction.h, * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. */ return PVS_SRC_OPERAND(t_src_index(vp, src), @@ -165,13 +146,14 @@ static unsigned long t_src(struct r300_vertex_program_code *vp, t_swizzle(GET_SWZ(src->Swizzle, 2)), t_swizzle(GET_SWZ(src->Swizzle, 3)), t_src_class(src->File), - src->Negate) | (src->RelAddr << 4); + src->Negate) | + (src->RelAddr << 4) | (src->Abs << 3); } static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, - struct prog_src_register *src) + struct rc_src_register *src) { - /* src->Negate uses the NEGATE_ flags from program_instruction.h, + /* src->Negate uses the RC_MASK_ flags from program_instruction.h, * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. */ return PVS_SRC_OPERAND(t_src_index(vp, src), @@ -180,79 +162,79 @@ static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, t_swizzle(GET_SWZ(src->Swizzle, 0)), t_swizzle(GET_SWZ(src->Swizzle, 0)), t_src_class(src->File), - src->Negate ? NEGATE_XYZW : NEGATE_NONE) | - (src->RelAddr << 4); + src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) | + (src->RelAddr << 4) | (src->Abs << 3); } -static GLboolean valid_dst(struct r300_vertex_program_code *vp, - struct prog_dst_register *dst) +static int valid_dst(struct r300_vertex_program_code *vp, + struct rc_dst_register *dst) { - if (dst->File == PROGRAM_OUTPUT && vp->outputs[dst->Index] == -1) { - return GL_FALSE; - } else if (dst->File == PROGRAM_ADDRESS) { + if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) { + return 0; + } else if (dst->File == RC_FILE_ADDRESS) { assert(dst->Index == 0); } - return GL_TRUE; + return 1; } static void ei_vector1(struct r300_vertex_program_code *vp, - GLuint hw_opcode, - struct prog_instruction *vpi, - GLuint * inst) + unsigned int hw_opcode, + struct rc_sub_instruction *vpi, + unsigned int * inst) { inst[0] = PVS_OP_DST_OPERAND(hw_opcode, - GL_FALSE, - GL_FALSE, + 0, + 0, t_dst_index(vp, &vpi->DstReg), t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File)); inst[1] = t_src(vp, &vpi->SrcReg[0]); - inst[2] = __CONST(0, SWIZZLE_ZERO); - inst[3] = __CONST(0, SWIZZLE_ZERO); + inst[2] = __CONST(0, RC_SWIZZLE_ZERO); + inst[3] = __CONST(0, RC_SWIZZLE_ZERO); } static void ei_vector2(struct r300_vertex_program_code *vp, - GLuint hw_opcode, - struct prog_instruction *vpi, - GLuint * inst) + unsigned int hw_opcode, + struct rc_sub_instruction *vpi, + unsigned int * inst) { inst[0] = PVS_OP_DST_OPERAND(hw_opcode, - GL_FALSE, - GL_FALSE, + 0, + 0, t_dst_index(vp, &vpi->DstReg), t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File)); inst[1] = t_src(vp, &vpi->SrcReg[0]); inst[2] = t_src(vp, &vpi->SrcReg[1]); - inst[3] = __CONST(1, SWIZZLE_ZERO); + inst[3] = __CONST(1, RC_SWIZZLE_ZERO); } static void ei_math1(struct r300_vertex_program_code *vp, - GLuint hw_opcode, - struct prog_instruction *vpi, - GLuint * inst) + unsigned int hw_opcode, + struct rc_sub_instruction *vpi, + unsigned int * inst) { inst[0] = PVS_OP_DST_OPERAND(hw_opcode, - GL_TRUE, - GL_FALSE, + 1, + 0, t_dst_index(vp, &vpi->DstReg), t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File)); inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); - inst[2] = __CONST(0, SWIZZLE_ZERO); - inst[3] = __CONST(0, SWIZZLE_ZERO); + inst[2] = __CONST(0, RC_SWIZZLE_ZERO); + inst[3] = __CONST(0, RC_SWIZZLE_ZERO); } static void ei_lit(struct r300_vertex_program_code *vp, - struct prog_instruction *vpi, - GLuint * inst) + struct rc_sub_instruction *vpi, + unsigned int * inst) { //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, - GL_TRUE, - GL_FALSE, + 1, + 0, t_dst_index(vp, &vpi->DstReg), t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File)); @@ -262,127 +244,94 @@ static void ei_lit(struct r300_vertex_program_code *vp, PVS_SRC_SELECT_FORCE_0, // Z t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y t_src_class(vpi->SrcReg[0].File), - vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) | + vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | (vpi->SrcReg[0].RelAddr << 4); inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W PVS_SRC_SELECT_FORCE_0, // Z t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X t_src_class(vpi->SrcReg[0].File), - vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) | + vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | (vpi->SrcReg[0].RelAddr << 4); inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X PVS_SRC_SELECT_FORCE_0, // Z t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W t_src_class(vpi->SrcReg[0].File), - vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) | + vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | (vpi->SrcReg[0].RelAddr << 4); } static void ei_mad(struct r300_vertex_program_code *vp, - struct prog_instruction *vpi, - GLuint * inst) + struct rc_sub_instruction *vpi, + unsigned int * inst) { - inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, - GL_FALSE, - GL_TRUE, - t_dst_index(vp, &vpi->DstReg), - t_dst_mask(vpi->DstReg.WriteMask), - t_dst_class(vpi->DstReg.File)); + /* Remarks about hardware limitations of MAD + * (please preserve this comment, as this information is _NOT_ + * in the documentation provided by AMD). + * + * As described in the documentation, MAD with three unique temporary + * source registers requires the use of the macro version. + * + * However (and this is not mentioned in the documentation), apparently + * the macro version is _NOT_ a full superset of the normal version. + * In particular, the macro version does not always work when relative + * addressing is used in the source operands. + * + * This limitation caused incorrect rendering in Sauerbraten's OpenGL + * assembly shader path when using medium quality animations + * (i.e. animations with matrix blending instead of quaternion blending). + * + * Unfortunately, I (nha) have been unable to extract a Piglit regression + * test for this issue - for some reason, it is possible to have vertex + * programs whose prefix is *exactly* the same as the prefix of the + * offending program in Sauerbraten up to the offending instruction + * without causing any trouble. + * + * Bottom line: Only use the macro version only when really necessary; + * according to AMD docs, this should improve performance by one clock + * as a nice side bonus. + */ + if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && + vpi->SrcReg[1].File == RC_FILE_TEMPORARY && + vpi->SrcReg[2].File == RC_FILE_TEMPORARY && + vpi->SrcReg[0].Index != vpi->SrcReg[1].Index && + vpi->SrcReg[0].Index != vpi->SrcReg[2].Index && + vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) { + inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, + 0, + 1, + t_dst_index(vp, &vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask), + t_dst_class(vpi->DstReg.File)); + } else { + inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, + 0, + 0, + t_dst_index(vp, &vpi->DstReg), + t_dst_mask(vpi->DstReg.WriteMask), + t_dst_class(vpi->DstReg.File)); + } inst[1] = t_src(vp, &vpi->SrcReg[0]); inst[2] = t_src(vp, &vpi->SrcReg[1]); inst[3] = t_src(vp, &vpi->SrcReg[2]); } static void ei_pow(struct r300_vertex_program_code *vp, - struct prog_instruction *vpi, - GLuint * inst) + struct rc_sub_instruction *vpi, + unsigned int * inst) { inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, - GL_TRUE, - GL_FALSE, + 1, + 0, t_dst_index(vp, &vpi->DstReg), t_dst_mask(vpi->DstReg.WriteMask), t_dst_class(vpi->DstReg.File)); inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); - inst[2] = __CONST(0, SWIZZLE_ZERO); + inst[2] = __CONST(0, RC_SWIZZLE_ZERO); inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]); } -static void t_inputs_outputs(struct r300_vertex_program_compiler * c) -{ - int i; - int cur_reg; - GLuint OutputsWritten, InputsRead; - - OutputsWritten = c->Base.Program.OutputsWritten; - InputsRead = c->Base.Program.InputsRead; - - cur_reg = -1; - for (i = 0; i < VERT_ATTRIB_MAX; i++) { - if (InputsRead & (1 << i)) - c->code->inputs[i] = ++cur_reg; - else - c->code->inputs[i] = -1; - } - - cur_reg = 0; - for (i = 0; i < VERT_RESULT_MAX; i++) - c->code->outputs[i] = -1; - - assert(OutputsWritten & (1 << VERT_RESULT_HPOS)); - - if (OutputsWritten & (1 << VERT_RESULT_HPOS)) { - c->code->outputs[VERT_RESULT_HPOS] = cur_reg++; - } - - if (OutputsWritten & (1 << VERT_RESULT_PSIZ)) { - c->code->outputs[VERT_RESULT_PSIZ] = cur_reg++; - } - - /* If we're writing back facing colors we need to send - * four colors to make front/back face colors selection work. - * If the vertex program doesn't write all 4 colors, lets - * pretend it does by skipping output index reg so the colors - * get written into appropriate output vectors. - */ - if (OutputsWritten & (1 << VERT_RESULT_COL0)) { - c->code->outputs[VERT_RESULT_COL0] = cur_reg++; - } else if (OutputsWritten & (1 << VERT_RESULT_BFC0) || - OutputsWritten & (1 << VERT_RESULT_BFC1)) { - cur_reg++; - } - - if (OutputsWritten & (1 << VERT_RESULT_COL1)) { - c->code->outputs[VERT_RESULT_COL1] = cur_reg++; - } else if (OutputsWritten & (1 << VERT_RESULT_BFC0) || - OutputsWritten & (1 << VERT_RESULT_BFC1)) { - cur_reg++; - } - - if (OutputsWritten & (1 << VERT_RESULT_BFC0)) { - c->code->outputs[VERT_RESULT_BFC0] = cur_reg++; - } else if (OutputsWritten & (1 << VERT_RESULT_BFC1)) { - cur_reg++; - } - - if (OutputsWritten & (1 << VERT_RESULT_BFC1)) { - c->code->outputs[VERT_RESULT_BFC1] = cur_reg++; - } else if (OutputsWritten & (1 << VERT_RESULT_BFC0)) { - cur_reg++; - } - - for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++) { - if (OutputsWritten & (1 << i)) { - c->code->outputs[i] = cur_reg++; - } - } - - if (OutputsWritten & (1 << VERT_RESULT_FOGC)) { - c->code->outputs[VERT_RESULT_FOGC] = cur_reg++; - } -} static void translate_vertex_program(struct r300_vertex_program_compiler * compiler) { @@ -391,44 +340,53 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi compiler->code->pos_end = 0; /* Not supported yet */ compiler->code->length = 0; - t_inputs_outputs(compiler); + compiler->SetHwInputOutput(compiler); for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) { - struct prog_instruction *vpi = &rci->I; - GLuint *inst = compiler->code->body.d + compiler->code->length; + struct rc_sub_instruction *vpi = &rci->U.I; + unsigned int *inst = compiler->code->body.d + compiler->code->length; /* Skip instructions writing to non-existing destination */ if (!valid_dst(compiler->code, &vpi->DstReg)) continue; - if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) { + if (compiler->code->length >= R500_VS_MAX_ALU_DWORDS || + (compiler->code->length >= R300_VS_MAX_ALU_DWORDS && !compiler->Base.is_r500)) { rc_error(&compiler->Base, "Vertex program has too many instructions\n"); return; } + assert(compiler->Base.is_r500 || + (vpi->Opcode != RC_OPCODE_SEQ && + vpi->Opcode != RC_OPCODE_SNE)); + switch (vpi->Opcode) { - case OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break; - case OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; - case OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; - case OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; - case OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; - case OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; - case OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; - case OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; - case OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; - case OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; - case OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break; - case OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break; - case OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break; - case OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break; - case OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break; - case OPCODE_POW: ei_pow(compiler->code, vpi, inst); break; - case OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break; - case OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break; - case OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break; - case OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break; + case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break; + case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; + case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; + case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; + case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; + case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; + case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; + case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; + case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; + case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; + case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; + case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break; + case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break; + case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break; + case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break; + case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break; + case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break; + case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break; + case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break; + case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break; + case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break; + case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break; + case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break; + case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break; default: - rc_error(&compiler->Base, "Unknown opcode %i\n", vpi->Opcode); + rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name); return; } @@ -440,38 +398,37 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi } struct temporary_allocation { - GLuint Allocated:1; - GLuint HwTemp:15; + unsigned int Allocated:1; + unsigned int HwTemp:15; struct rc_instruction * LastRead; }; static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler) { struct rc_instruction *inst; - GLuint num_orig_temps = 0; - GLboolean hwtemps[VSF_MAX_FRAGMENT_TEMPS]; + unsigned int num_orig_temps = 0; + char hwtemps[R300_VS_MAX_TEMPS]; struct temporary_allocation * ta; - GLuint i, j; + unsigned int i, j; compiler->code->num_temporaries = 0; memset(hwtemps, 0, sizeof(hwtemps)); /* Pass 1: Count original temporaries and allocate structures */ for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { - GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode); - GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode); + const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - for (i = 0; i < numsrcs; ++i) { - if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) { - if (inst->I.SrcReg[i].Index >= num_orig_temps) - num_orig_temps = inst->I.SrcReg[i].Index + 1; + for (i = 0; i < opcode->NumSrcRegs; ++i) { + if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { + if (inst->U.I.SrcReg[i].Index >= num_orig_temps) + num_orig_temps = inst->U.I.SrcReg[i].Index + 1; } } - if (numdsts) { - if (inst->I.DstReg.File == PROGRAM_TEMPORARY) { - if (inst->I.DstReg.Index >= num_orig_temps) - num_orig_temps = inst->I.DstReg.Index + 1; + if (opcode->HasDstReg) { + if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { + if (inst->U.I.DstReg.Index >= num_orig_temps) + num_orig_temps = inst->U.I.DstReg.Index + 1; } } } @@ -482,100 +439,137 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c /* Pass 2: Determine original temporary lifetimes */ for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { - GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode); + const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - for (i = 0; i < numsrcs; ++i) { - if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) - ta[inst->I.SrcReg[i].Index].LastRead = inst; + for (i = 0; i < opcode->NumSrcRegs; ++i) { + if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) + ta[inst->U.I.SrcReg[i].Index].LastRead = inst; } } /* Pass 3: Register allocation */ for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { - GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode); - GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode); + const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - for (i = 0; i < numsrcs; ++i) { - if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) { - GLuint orig = inst->I.SrcReg[i].Index; - inst->I.SrcReg[i].Index = ta[orig].HwTemp; + for (i = 0; i < opcode->NumSrcRegs; ++i) { + if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { + unsigned int orig = inst->U.I.SrcReg[i].Index; + inst->U.I.SrcReg[i].Index = ta[orig].HwTemp; if (ta[orig].Allocated && inst == ta[orig].LastRead) - hwtemps[ta[orig].HwTemp] = GL_FALSE; + hwtemps[ta[orig].HwTemp] = 0; } } - if (numdsts) { - if (inst->I.DstReg.File == PROGRAM_TEMPORARY) { - GLuint orig = inst->I.DstReg.Index; + if (opcode->HasDstReg) { + if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { + unsigned int orig = inst->U.I.DstReg.Index; if (!ta[orig].Allocated) { - for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) { + for(j = 0; j < R300_VS_MAX_TEMPS; ++j) { if (!hwtemps[j]) break; } - if (j >= VSF_MAX_FRAGMENT_TEMPS) { + if (j >= R300_VS_MAX_TEMPS) { fprintf(stderr, "Out of hw temporaries\n"); } else { - ta[orig].Allocated = GL_TRUE; + ta[orig].Allocated = 1; ta[orig].HwTemp = j; - hwtemps[j] = GL_TRUE; + hwtemps[j] = 1; if (j >= compiler->code->num_temporaries) compiler->code->num_temporaries = j + 1; } } - inst->I.DstReg.Index = ta[orig].HwTemp; + inst->U.I.DstReg.Index = ta[orig].HwTemp; } } } } +/** + * R3xx-R4xx vertex engine does not support the Absolute source operand modifier + * and the Saturate opcode modifier. Only Absolute is currently transformed. + */ +static int transform_nonnative_modifiers( + struct radeon_compiler *c, + struct rc_instruction *inst, + void* unused) +{ + const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode); + unsigned i; + + /* Transform ABS(a) to MAX(a, -a). */ + for (i = 0; i < opcode->NumSrcRegs; i++) { + if (inst->U.I.SrcReg[i].Abs) { + struct rc_instruction *new_inst; + unsigned temp; + + inst->U.I.SrcReg[i].Abs = 0; + + temp = rc_find_free_temporary(c); + + new_inst = rc_insert_new_instruction(c, inst->Prev); + new_inst->U.I.Opcode = RC_OPCODE_MAX; + new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY; + new_inst->U.I.DstReg.Index = temp; + new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i]; + new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i]; + new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; + + memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i])); + inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY; + inst->U.I.SrcReg[i].Index = temp; + inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW; + } + } + return 1; +} /** * Vertex engine cannot read two inputs or two constants at the same time. * Introduce intermediate MOVs to temporary registers to account for this. */ -static GLboolean transform_source_conflicts( +static int transform_source_conflicts( struct radeon_compiler *c, struct rc_instruction* inst, void* unused) { - GLuint num_operands = _mesa_num_inst_src_regs(inst->I.Opcode); + const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - if (num_operands == 3) { - if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[2]) - || t_src_conflict(inst->I.SrcReg[0], inst->I.SrcReg[2])) { + if (opcode->NumSrcRegs == 3) { + if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) + || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) { int tmpreg = rc_find_free_temporary(c); struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); - inst_mov->I.Opcode = OPCODE_MOV; - inst_mov->I.DstReg.File = PROGRAM_TEMPORARY; - inst_mov->I.DstReg.Index = tmpreg; - inst_mov->I.SrcReg[0] = inst->I.SrcReg[2]; - - reset_srcreg(&inst->I.SrcReg[2]); - inst->I.SrcReg[2].File = PROGRAM_TEMPORARY; - inst->I.SrcReg[2].Index = tmpreg; + inst_mov->U.I.Opcode = RC_OPCODE_MOV; + inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; + inst_mov->U.I.DstReg.Index = tmpreg; + inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; + + reset_srcreg(&inst->U.I.SrcReg[2]); + inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY; + inst->U.I.SrcReg[2].Index = tmpreg; } } - if (num_operands >= 2) { - if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[0])) { + if (opcode->NumSrcRegs >= 2) { + if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) { int tmpreg = rc_find_free_temporary(c); struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); - inst_mov->I.Opcode = OPCODE_MOV; - inst_mov->I.DstReg.File = PROGRAM_TEMPORARY; - inst_mov->I.DstReg.Index = tmpreg; - inst_mov->I.SrcReg[0] = inst->I.SrcReg[1]; - - reset_srcreg(&inst->I.SrcReg[1]); - inst->I.SrcReg[1].File = PROGRAM_TEMPORARY; - inst->I.SrcReg[1].Index = tmpreg; + inst_mov->U.I.Opcode = RC_OPCODE_MOV; + inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; + inst_mov->U.I.DstReg.Index = tmpreg; + inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1]; + + reset_srcreg(&inst->U.I.SrcReg[1]); + inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY; + inst->U.I.SrcReg[1].Index = tmpreg; } } - return GL_TRUE; + return 1; } static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler) @@ -586,74 +580,110 @@ static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler) if ((compiler->RequiredOutputs & (1 << i)) && !(compiler->Base.Program.OutputsWritten & (1 << i))) { struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev); - inst->I.Opcode = OPCODE_MOV; + inst->U.I.Opcode = RC_OPCODE_MOV; - inst->I.DstReg.File = PROGRAM_OUTPUT; - inst->I.DstReg.Index = i; - inst->I.DstReg.WriteMask = WRITEMASK_XYZW; + inst->U.I.DstReg.File = RC_FILE_OUTPUT; + inst->U.I.DstReg.Index = i; + inst->U.I.DstReg.WriteMask = RC_MASK_XYZW; - inst->I.SrcReg[0].File = PROGRAM_CONSTANT; - inst->I.SrcReg[0].Index = 0; - inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW; + inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT; + inst->U.I.SrcReg[0].Index = 0; + inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; compiler->Base.Program.OutputsWritten |= 1 << i; } } } -static void nqssadceInit(struct nqssadce_state* s) +static void dataflow_outputs_mark_used(void * userdata, void * data, + void (*callback)(void *, unsigned int, unsigned int)) { - struct r300_vertex_program_compiler * compiler = s->UserData; + struct r300_vertex_program_compiler * c = userdata; int i; - for(i = 0; i < VERT_RESULT_MAX; ++i) { - if (compiler->RequiredOutputs & (1 << i)) - s->Outputs[i].Sourced = WRITEMASK_XYZW; + for(i = 0; i < 32; ++i) { + if (c->RequiredOutputs & (1 << i)) + callback(data, i, RC_MASK_XYZW); } } -static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg) +static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) { (void) opcode; (void) reg; - return GL_TRUE; + return 1; } +static void debug_program_log(struct r300_vertex_program_compiler* c, const char * where) +{ + if (c->Base.Debug) { + fprintf(stderr, "Vertex Program: %s\n", where); + rc_print_program(&c->Base.Program); + } +} + + +static struct rc_swizzle_caps r300_vertprog_swizzle_caps = { + .IsNative = &swizzle_is_native, + .Split = 0 /* should never be called */ +}; void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) { - rc_mesa_to_rc_program(&compiler->Base, compiler->program); - compiler->program = 0; + struct emulate_loop_state loop_state; + + compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; - rc_move_output(&compiler->Base, VERT_RESULT_PSIZ, VERT_RESULT_PSIZ, WRITEMASK_X); + addArtificialOutputs(compiler); - if (compiler->state.WPosAttr != FRAG_ATTRIB_MAX) { - rc_copy_output(&compiler->Base, - VERT_RESULT_HPOS, - compiler->state.WPosAttr - FRAG_ATTRIB_TEX0 + VERT_RESULT_TEX0); + debug_program_log(compiler, "before compilation"); + + /* XXX Ideally this should be done only for r3xx, but since + * we don't have branching support for r5xx, we use the emulation + * on all chipsets. */ + rc_transform_unroll_loops(&compiler->Base, &loop_state); + + debug_program_log(compiler, "after transform loops"); + + if (compiler->Base.is_r500){ + rc_emulate_loops(&loop_state, R500_VS_MAX_ALU); + } else { + rc_emulate_loops(&loop_state, R300_VS_MAX_ALU); } + debug_program_log(compiler, "after emulate loops"); - if (compiler->state.FogAttr != FRAG_ATTRIB_MAX) { - rc_move_output(&compiler->Base, - VERT_RESULT_FOGC, - compiler->state.FogAttr - FRAG_ATTRIB_TEX0 + VERT_RESULT_TEX0, WRITEMASK_X); - } + rc_emulate_branches(&compiler->Base); - addArtificialOutputs(compiler); + debug_program_log(compiler, "after emulate branches"); - { + if (compiler->Base.is_r500) { struct radeon_program_transformation transformations[] = { { &r300_transform_vertex_alu, 0 }, + { &r300_transform_trig_scale_vertex, 0 } }; - radeonLocalTransform(&compiler->Base, 1, transformations); - } + radeonLocalTransform(&compiler->Base, 2, transformations); - if (compiler->Base.Debug) { - fprintf(stderr, "Vertex program after native rewrite:\n"); - rc_print_program(&compiler->Base.Program); - fflush(stdout); + debug_program_log(compiler, "after native rewrite"); + } else { + struct radeon_program_transformation transformations[] = { + { &r300_transform_vertex_alu, 0 }, + { &radeonTransformTrigSimple, 0 } + }; + radeonLocalTransform(&compiler->Base, 2, transformations); + + debug_program_log(compiler, "after native rewrite"); + + /* Note: This pass has to be done seperately from ALU rewrite, + * because it needs to check every instruction. + */ + struct radeon_program_transformation transformations2[] = { + { &transform_nonnative_modifiers, 0 }, + }; + radeonLocalTransform(&compiler->Base, 1, transformations2); + + debug_program_log(compiler, "after emulate modifiers"); } { @@ -667,29 +697,17 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) radeonLocalTransform(&compiler->Base, 1, transformations); } - if (compiler->Base.Debug) { - fprintf(stderr, "Vertex program after source conflict resolve:\n"); - rc_print_program(&compiler->Base.Program); - fflush(stdout); - } + debug_program_log(compiler, "after source conflict resolve"); - { - struct radeon_nqssadce_descr nqssadce = { - .Init = &nqssadceInit, - .IsNativeSwizzle = &swizzleIsNative, - .BuildSwizzle = NULL - }; - radeonNqssaDce(&compiler->Base, &nqssadce, compiler); + rc_dataflow_deadcode(&compiler->Base, &dataflow_outputs_mark_used, compiler); - /* We need this step for reusing temporary registers */ - allocate_temporary_registers(compiler); + debug_program_log(compiler, "after deadcode"); - if (compiler->Base.Debug) { - fprintf(stderr, "Vertex program after NQSSADCE:\n"); - rc_print_program(&compiler->Base.Program); - fflush(stdout); - } - } + rc_dataflow_swizzles(&compiler->Base); + + allocate_temporary_registers(compiler); + + debug_program_log(compiler, "after dataflow"); translate_vertex_program(compiler); @@ -697,4 +715,9 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler) compiler->code->InputsRead = compiler->Base.Program.InputsRead; compiler->code->OutputsWritten = compiler->Base.Program.OutputsWritten; + + if (compiler->Base.Debug) { + fprintf(stderr, "Final vertex program code:\n"); + r300_vertex_program_dump(compiler->code); + } }