From: Nicolai Haehnle Date: Fri, 11 Jul 2008 23:14:35 +0000 (+0200) Subject: r500_fragprog: Major refactoring of final emit X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b6765c34993b08bba4acf20738c8938413ed4daf;p=mesa.git r500_fragprog: Major refactoring of final emit Use an abstracted instruction scheduling and register allocation algorithm that we will be able to share with r300_fragprog. Unlike the original emit code, this code tries to pair instructions that only use the RGB part of the ALU with instructions that only use the alpha part. However, the pairing algorithm still has some shortcomings; for example, it doesn't generate optimal code for the emulation of LIT. --- diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile index 1dc75a30625..9baa1e71312 100644 --- a/src/mesa/drivers/dri/r300/Makefile +++ b/src/mesa/drivers/dri/r300/Makefile @@ -38,6 +38,7 @@ DRIVER_SOURCES = \ r300_texstate.c \ radeon_program.c \ radeon_program_alu.c \ + radeon_program_pair.c \ radeon_nqssadce.c \ r300_vertprog.c \ r300_fragprog.c \ diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h index a69beba9a7b..8e9c5cee5f3 100644 --- a/src/mesa/drivers/dri/r300/r300_context.h +++ b/src/mesa/drivers/dri/r300/r300_context.h @@ -777,9 +777,6 @@ struct r500_fragment_program_code { GLuint inst4; GLuint inst5; } inst[512]; - /* TODO: This is magic! 
*/ - - int temp_reg_offset; int inst_offset; int inst_end; diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index cd232c5b7b2..ec2b58377c5 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -2921,6 +2921,7 @@ enum { # define R500_RGB_SRCP_OP_RGB1_PLUS_RGB0 (2 << 30) # define R500_RGB_SRCP_OP_1_MINUS_RGB0 (3 << 30) #define R500_US_CMN_INST_0 0xb800 +# define R500_INST_TYPE_MASK (3 << 0) # define R500_INST_TYPE_ALU (0 << 0) # define R500_INST_TYPE_OUT (1 << 0) # define R500_INST_TYPE_FC (2 << 0) diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c index c92ea8f5e6d..7b18efa69d6 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog.c @@ -408,12 +408,10 @@ void r500TranslateFragmentShader(r300ContextPtr r300, r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM); if (RADEON_DEBUG & DEBUG_PIXEL) { - fprintf(stderr, "Mesa program:\n"); - fprintf(stderr, "-------------\n"); - _mesa_print_program(&fp->mesa_program.Base); - fflush(stdout); - if (fp->translated) + if (fp->translated) { + _mesa_printf("Machine-readable code:\n"); dump_program(&fp->code); + } } } diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c index 275911679dc..b6f52474e26 100644 --- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c +++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c @@ -43,913 +43,281 @@ * */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" -#include "shader/prog_instruction.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" - -#include "r300_context.h" #include "r500_fragprog.h" -#include "r300_reg.h" -#include "r300_state.h" -/* Mapping Mesa registers to R500 temporaries */ -struct reg_acc { - int reg; /* Assigned hw temp */ - unsigned int refcount; /* Number of uses by mesa program */ -}; - -/** - * Describe the 
current lifetime information for an R300 temporary - */ -struct reg_lifetime { - /* Index of the first slot where this register is free in the sense - that it can be used as a new destination register. - This is -1 if the register has been assigned to a Mesa register - and the last access to the register has not yet been emitted */ - int free; - - /* Index of the first slot where this register is currently reserved. - This is used to stop e.g. a scalar operation from being moved - before the allocation time of a register that was first allocated - for a vector operation. */ - int reserved; - - /* Index of the first slot in which the register can be used as a - source without losing the value that is written by the last - emitted instruction that writes to the register */ - int vector_valid; - int scalar_valid; - - /* Index to the slot where the register was last read. - This is also the first slot in which the register may be written again */ - int vector_lastread; - int scalar_lastread; -}; - -/** - * Store information during compilation of fragment programs. - */ -struct r500_pfs_compile_state { - struct r500_fragment_program_compiler *compiler; +#include "radeon_program_pair.h" - /* number of ALU slots used so far */ - int nrslots; - /* Used to map Mesa's inputs/temps onto hardware temps */ - int temp_in_use; - struct reg_acc inputs[32]; /* don't actually need 32... */ -}; +#define PROG_CODE \ + struct r500_fragment_program_compiler *c = (struct r500_fragment_program_compiler*)data; \ + struct r500_fragment_program_code *code = c->code -/* - * Useful macros and values - */ -#define ERROR(fmt, args...) do { \ +#define error(fmt, args...) 
do { \ fprintf(stderr, "%s::%s(): " fmt "\n", \ __FILE__, __FUNCTION__, ##args); \ - cs->compiler->fp->error = GL_TRUE; \ } while(0) -#define PROG_CODE struct r500_fragment_program_code *code = cs->compiler->code - -#define R500_US_NUM_TEMP_REGS 128 -#define R500_US_NUM_CONST_REGS 256 - -/* "Register" flags */ -#define REG_CONSTANT (1 << 8) -#define REG_SRC_REL (1 << 9) -#define REG_DEST_REL (1 << 7) - -/* Swizzle tools */ -#define R500_SWIZZLE_ZERO 4 -#define R500_SWIZZLE_HALF 5 -#define R500_SWIZZLE_ONE 6 -#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6)) -#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6)) -#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6)) -#define R500_SWIZ_MOD_NEG 1 -#define R500_SWIZ_MOD_ABS 2 -#define R500_SWIZ_MOD_NEG_ABS 3 -/* Swizzles for inst2 */ -#define MAKE_SWIZ_TEX_STRQ(x) (x << 8) -#define MAKE_SWIZ_TEX_RGBA(x) (x << 24) -/* Swizzles for inst3 */ -#define MAKE_SWIZ_RGB_A(x) (x << 2) -#define MAKE_SWIZ_RGB_B(x) (x << 15) -/* Swizzles for inst4 */ -#define MAKE_SWIZ_ALPHA_A(x) (x << 14) -#define MAKE_SWIZ_ALPHA_B(x) (x << 21) -/* Swizzle for inst5 */ -#define MAKE_SWIZ_RGBA_C(x) (x << 14) -#define MAKE_SWIZ_ALPHA_C(x) (x << 27) - -/* Writemasks */ -#define R500_WRITEMASK_G 0x2 -#define R500_WRITEMASK_B 0x4 -#define R500_WRITEMASK_RGB 0x7 -#define R500_WRITEMASK_A 0x8 -#define R500_WRITEMASK_AR 0x9 -#define R500_WRITEMASK_AG 0xA -#define R500_WRITEMASK_ARG 0xB -#define R500_WRITEMASK_AB 0xC -#define R500_WRITEMASK_ARGB 0xF - - -static const struct prog_dst_register dstreg_template = { - .File = PROGRAM_TEMPORARY, - .Index = 0, - .WriteMask = WRITEMASK_XYZW -}; -static INLINE GLuint fix_hw_swizzle(GLuint swz) -{ - if (swz == 5) swz = 6; - if (swz == SWIZZLE_NIL) swz = 4; - return swz; -} - -static INLINE GLuint make_rgb_swizzle(struct prog_src_register src) { - GLuint swiz = 0x0; - GLuint temp; - /* This could be optimized, but it should be plenty fast already. 
*/ - int i; - int negatebase = 0; - for (i = 0; i < 3; i++) { - temp = GET_SWZ(src.Swizzle, i); - if (temp != SWIZZLE_NIL && GET_BIT(src.NegateBase, i)) - negatebase = 1; - temp = fix_hw_swizzle(temp); - swiz |= temp << i*3; - } - if (src.Abs) - swiz |= R500_SWIZ_MOD_ABS << 9; - else if (negatebase) - swiz |= R500_SWIZ_MOD_NEG << 9; - if (src.NegateAbs) - swiz ^= R500_SWIZ_MOD_NEG << 9; - return swiz; -} - -static INLINE GLuint make_rgba_swizzle(GLuint src) { - GLuint swiz = 0x0; - GLuint temp; - int i; - for (i = 0; i < 4; i++) { - temp = GET_SWZ(src, i); - temp = fix_hw_swizzle(temp); - swiz |= temp << i*3; - } - return swiz; -} - -static INLINE GLuint make_alpha_swizzle(struct prog_src_register src) { - GLuint swiz = GET_SWZ(src.Swizzle, 3); - - swiz = fix_hw_swizzle(swiz); - - if (src.Abs) { - swiz |= R500_SWIZ_MOD_ABS << 3; - } else if (src.NegateBase & 8) { - swiz |= R500_SWIZ_MOD_NEG << 3; - } - if (src.NegateAbs) - swiz ^= R500_SWIZ_MOD_NEG << 3; - - return swiz; -} - -static INLINE GLuint make_sop_swizzle(struct prog_src_register src) { - GLuint swiz = GET_SWZ(src.Swizzle, 0); - - swiz = fix_hw_swizzle(swiz); - - if (src.Abs) { - swiz |= R500_SWIZ_MOD_ABS << 3; - } else if (src.NegateBase & 1) { - swiz |= R500_SWIZ_MOD_NEG << 3; - } - if (src.NegateAbs) - swiz ^= R500_SWIZ_MOD_NEG << 3; - - return swiz; -} - -static INLINE GLuint make_strq_swizzle(struct prog_src_register src) { - GLuint swiz = 0x0, temp = 0x0; - int i; - for (i = 0; i < 4; i++) { - temp = GET_SWZ(src.Swizzle, i) & 0x3; - swiz |= temp << i*2; - } - return swiz; -} - -static int get_temp(struct r500_pfs_compile_state *cs, int slot) { - - PROG_CODE; - - int r = code->temp_reg_offset + cs->temp_in_use + slot; - - if (r > R500_US_NUM_TEMP_REGS) { - ERROR("Too many temporary registers requested, can't compile!\n"); - } - - return r; -} - -/* Borrowed verbatim from r300_fragprog since it hasn't changed. 
*/ -static GLuint emit_const4fv(struct r500_pfs_compile_state *cs, - struct prog_src_register srcreg) +/** + * Callback to register hardware constants. + */ +static GLboolean emit_const(void *data, GLuint file, GLuint idx, GLuint *hwindex) { PROG_CODE; - GLuint reg = 0x0; - int index; - - for (index = 0; index < code->const_nr; ++index) { - if (code->constant[index].File == srcreg.File && - code->constant[index].Index == srcreg.Index) + for (*hwindex = 0; *hwindex < code->const_nr; ++*hwindex) { + if (code->constant[*hwindex].File == file && + code->constant[*hwindex].Index == idx) break; } - if (index >= code->const_nr) { - if (index >= R500_US_NUM_CONST_REGS) { - ERROR("Out of hw constants!\n"); - return reg; + if (*hwindex >= code->const_nr) { + if (*hwindex >= PFS_NUM_CONST_REGS) { + error("Out of hw constants!\n"); + return GL_FALSE; } code->const_nr++; - code->constant[index] = srcreg; - } - - reg = index | REG_CONSTANT; - return reg; -} - -static GLuint make_src(struct r500_pfs_compile_state *cs, struct prog_src_register src) { - PROG_CODE; - GLuint reg; - switch (src.File) { - case PROGRAM_TEMPORARY: - reg = src.Index + code->temp_reg_offset; - break; - case PROGRAM_INPUT: - reg = cs->inputs[src.Index].reg; - break; - case PROGRAM_LOCAL_PARAM: - case PROGRAM_ENV_PARAM: - case PROGRAM_STATE_VAR: - case PROGRAM_NAMED_PARAM: - case PROGRAM_CONSTANT: - reg = emit_const4fv(cs, src); - break; - case PROGRAM_BUILTIN: - reg = 0x0; - break; - default: - ERROR("Can't handle src.File %x\n", src.File); - reg = 0x0; - break; + code->constant[*hwindex].File = file; + code->constant[*hwindex].Index = idx; } - return reg; -} -static GLuint make_dest(struct r500_pfs_compile_state *cs, struct prog_dst_register dest) { - PROG_CODE; - GLuint reg; - switch (dest.File) { - case PROGRAM_TEMPORARY: - reg = dest.Index + code->temp_reg_offset; - break; - case PROGRAM_OUTPUT: - /* Eventually we may need to handle multiple - * rendering targets... 
*/ - reg = dest.Index; - break; - case PROGRAM_BUILTIN: - reg = 0x0; - break; - default: - ERROR("Can't handle dest.File %x\n", dest.File); - reg = 0x0; - break; - } - return reg; -} - -static int emit_slot(struct r500_pfs_compile_state *cs) -{ - if (cs->nrslots >= 512) { - ERROR("Too many instructions"); - cs->nrslots = 1; - return 0; - } - return cs->nrslots++; + return GL_TRUE; } -static int emit_tex(struct r500_pfs_compile_state *cs, - struct prog_instruction *fpi, int dest) +static GLuint translate_rgb_op(GLuint opcode) { - PROG_CODE; - int hwsrc, hwdest; - GLuint mask; - int counter = emit_slot(cs); - - mask = fpi->DstReg.WriteMask << 11; - hwsrc = make_src(cs, fpi->SrcReg[0]); - - if (fpi->DstReg.File == PROGRAM_OUTPUT) { - hwdest = get_temp(cs, 0); - } else { - hwdest = dest; - } - - code->inst[counter].inst0 = R500_INST_TYPE_TEX | mask - | R500_INST_TEX_SEM_WAIT; - - code->inst[counter].inst1 = R500_TEX_ID(fpi->TexSrcUnit) - | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED; - - if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) - code->inst[counter].inst1 |= R500_TEX_UNSCALED; - - switch (fpi->Opcode) { - case OPCODE_KIL: - code->inst[counter].inst1 |= R500_TEX_INST_TEXKILL; - break; - case OPCODE_TEX: - code->inst[counter].inst1 |= R500_TEX_INST_LD; - break; - case OPCODE_TXB: - code->inst[counter].inst1 |= R500_TEX_INST_LODBIAS; - break; - case OPCODE_TXP: - code->inst[counter].inst1 |= R500_TEX_INST_PROJ; - break; + switch(opcode) { + case OPCODE_CMP: return R500_ALU_RGBA_OP_CMP; + case OPCODE_DP3: return R500_ALU_RGBA_OP_DP3; + case OPCODE_DP4: return R500_ALU_RGBA_OP_DP4; + case OPCODE_FRC: return R500_ALU_RGBA_OP_FRC; default: - ERROR("emit_tex can't handle opcode %x\n", fpi->Opcode); + error("translate_rgb_op(%d): unknown opcode\n", opcode); + /* fall through */ + case OPCODE_NOP: + /* fall through */ + case OPCODE_MAD: return R500_ALU_RGBA_OP_MAD; + case OPCODE_MAX: return R500_ALU_RGBA_OP_MAX; + case OPCODE_MIN: return R500_ALU_RGBA_OP_MIN; + case 
OPCODE_REPL_ALPHA: return R500_ALU_RGBA_OP_SOP; } - - code->inst[counter].inst2 = R500_TEX_SRC_ADDR(hwsrc) - | MAKE_SWIZ_TEX_STRQ(make_strq_swizzle(fpi->SrcReg[0])) - /* | R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G - | R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A */ - | R500_TEX_DST_ADDR(hwdest) - | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G - | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A; - - code->inst[counter].inst3 = 0x0; - code->inst[counter].inst4 = 0x0; - code->inst[counter].inst5 = 0x0; - - if (fpi->DstReg.File == PROGRAM_OUTPUT) { - counter++; - code->inst[counter].inst0 = R500_INST_TYPE_OUT - | R500_INST_TEX_SEM_WAIT | (mask << 4); - code->inst[counter].inst1 = R500_RGB_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst2 = R500_ALPHA_ADDR0(get_temp(cs, 0)); - code->inst[counter].inst3 = R500_ALU_RGB_SEL_A_SRC0 - | MAKE_SWIZ_RGB_A(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_SEL_B_SRC0 - | MAKE_SWIZ_RGB_B(R500_SWIZ_RGB_RGB) - | R500_ALU_RGB_OMOD_DISABLE; - code->inst[counter].inst4 = R500_ALPHA_OP_CMP - | R500_ALPHA_ADDRD(dest) - | R500_ALPHA_SEL_A_SRC0 | MAKE_SWIZ_ALPHA_A(R500_ALPHA_SWIZ_A_A) - | R500_ALPHA_SEL_B_SRC0 | MAKE_SWIZ_ALPHA_B(R500_ALPHA_SWIZ_A_A) - | R500_ALPHA_OMOD_DISABLE; - code->inst[counter].inst5 = R500_ALU_RGBA_OP_CMP - | R500_ALU_RGBA_ADDRD(dest) - | MAKE_SWIZ_RGBA_C(R500_SWIZ_RGB_ZERO) - | MAKE_SWIZ_ALPHA_C(R500_SWIZZLE_ZERO); - } - - return counter; } -/* Do not call directly */ -static int _helper_emit_alu(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, - int File, int Index, int WriteMask) +static GLuint translate_alpha_op(GLuint opcode) { - PROG_CODE; - int counter = emit_slot(cs); - - code->inst[counter].inst4 = alphaop; - code->inst[counter].inst5 = rgbop; - - if (File == PROGRAM_OUTPUT) { - code->inst[counter].inst0 = R500_INST_TYPE_OUT; - - if (Index == FRAG_RESULT_COLR) { - code->inst[counter].inst0 |= WriteMask << 15; - } else if (Index == FRAG_RESULT_DEPR) { - code->inst[counter].inst4 |= 
R500_ALPHA_W_OMASK; - cs->compiler->fp->writes_depth = GL_TRUE; - } - } else { - int dest = Index + code->temp_reg_offset; - - code->inst[counter].inst0 = R500_INST_TYPE_ALU - | (WriteMask << 11); - code->inst[counter].inst4 |= R500_ALPHA_ADDRD(dest); - code->inst[counter].inst5 |= R500_ALU_RGBA_ADDRD(dest); + switch(opcode) { + case OPCODE_CMP: return R500_ALPHA_OP_CMP; + case OPCODE_COS: return R500_ALPHA_OP_COS; + case OPCODE_DP3: return R500_ALPHA_OP_DP; + case OPCODE_DP4: return R500_ALPHA_OP_DP; + case OPCODE_EX2: return R500_ALPHA_OP_EX2; + case OPCODE_FRC: return R500_ALPHA_OP_FRC; + case OPCODE_LG2: return R500_ALPHA_OP_LN2; + default: + error("translate_alpha_op(%d): unknown opcode\n", opcode); + /* fall through */ + case OPCODE_NOP: + /* fall through */ + case OPCODE_MAD: return R500_ALPHA_OP_MAD; + case OPCODE_MAX: return R500_ALPHA_OP_MAX; + case OPCODE_MIN: return R500_ALPHA_OP_MIN; + case OPCODE_RCP: return R500_ALPHA_OP_RCP; + case OPCODE_RSQ: return R500_ALPHA_OP_RSQ; + case OPCODE_SIN: return R500_ALPHA_OP_SIN; } - - code->inst[counter].inst0 |= R500_INST_TEX_SEM_WAIT; - - return counter; } -/** - * Prepare an ALU slot with the given RGB operation, ALPHA operation, and - * destination register. - */ -static int emit_alu(struct r500_pfs_compile_state *cs, GLuint rgbop, GLuint alphaop, struct prog_dst_register dst) -{ - return _helper_emit_alu(cs, rgbop, alphaop, dst.File, dst.Index, dst.WriteMask); -} - -/** - * Set an instruction's source 0 (both RGB and ALPHA) to the given hardware index. - */ -static void set_src0_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) +static GLuint fix_hw_swizzle(GLuint swz) { - PROG_CODE; - code->inst[ip].inst1 |= R500_RGB_ADDR0(src); - code->inst[ip].inst2 |= R500_ALPHA_ADDR0(src); + if (swz == 5) swz = 6; + if (swz == SWIZZLE_NIL) swz = 4; + return swz; } -/** - * Set an instruction's source 1 (both RGB and ALPHA) to the given hardware index. 
- */ -static void set_src1_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) +static GLuint translate_arg_rgb(struct radeon_pair_instruction *inst, int arg) { - PROG_CODE; - code->inst[ip].inst1 |= R500_RGB_ADDR1(src); - code->inst[ip].inst2 |= R500_ALPHA_ADDR1(src); -} + GLuint t = inst->RGB.Arg[arg].Source; + int comp; + t |= inst->RGB.Arg[arg].Negate << 11; + t |= inst->RGB.Arg[arg].Abs << 12; -/** - * Set an instruction's source 2 (both RGB and ALPHA) to the given hardware index. - */ -static void set_src2_direct(struct r500_pfs_compile_state *cs, int ip, GLuint src) -{ - PROG_CODE; - code->inst[ip].inst1 |= R500_RGB_ADDR2(src); - code->inst[ip].inst2 |= R500_ALPHA_ADDR2(src); -} + for(comp = 0; comp < 3; ++comp) + t |= fix_hw_swizzle(GET_SWZ(inst->RGB.Arg[arg].Swizzle, comp)) << (3*comp + 2); -/** - * Set an instruction's source 0 (both RGB and ALPHA) according to the given source register. - */ -static void set_src0(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) -{ - set_src0_direct(cs, ip, make_src(cs, srcreg)); + return t; } -/** - * Set an instruction's source 1 (both RGB and ALPHA) according to the given source register. - */ -static void set_src1(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) +static GLuint translate_arg_alpha(struct radeon_pair_instruction *inst, int i) { - set_src1_direct(cs, ip, make_src(cs, srcreg)); + GLuint t = inst->Alpha.Arg[i].Source; + t |= fix_hw_swizzle(inst->Alpha.Arg[i].Swizzle) << 2; + t |= inst->Alpha.Arg[i].Negate << 5; + t |= inst->Alpha.Arg[i].Abs << 6; + return t; } -/** - * Set an instruction's source 2 (both RGB and ALPHA) according to the given source register. 
- */ -static void set_src2(struct r500_pfs_compile_state *cs, int ip, struct prog_src_register srcreg) +static void use_temporary(struct r500_fragment_program_code* code, GLuint index) { - set_src2_direct(cs, ip, make_src(cs, srcreg)); + if (index > code->max_temp_idx) + code->max_temp_idx = index; } -/** - * Set an instruction's argument A (both RGB and ALPHA) from the given source, - * taking swizzles+neg+abs as specified (see also _reg version below). - */ -static void set_argA(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) +static GLuint use_source(struct r500_fragment_program_code* code, struct radeon_pair_instruction_source src) { - PROG_CODE; - code->inst[ip].inst3 |= (source << R500_ALU_RGB_SEL_A_SHIFT) | MAKE_SWIZ_RGB_A(swizRGB); - code->inst[ip].inst4 |= (source << R500_ALPHA_SEL_A_SHIFT) | MAKE_SWIZ_ALPHA_A(swizA); + if (!src.Constant) + use_temporary(code, src.Index); + return src.Index | src.Constant << 8; } -/** - * Set an instruction's argument B (both RGB and ALPHA) from the given source, - * taking swizzles+neg+abs as specified (see also _reg version below). - */ -static void set_argB(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) -{ - PROG_CODE; - code->inst[ip].inst3 |= (source << R500_ALU_RGB_SEL_B_SHIFT) | MAKE_SWIZ_RGB_B(swizRGB); - code->inst[ip].inst4 |= (source << R500_ALPHA_SEL_B_SHIFT) | MAKE_SWIZ_ALPHA_B(swizA); -} /** - * Set an instruction's argument C (both RGB and ALPHA) from the given source, - * taking swizzles+neg+abs as specified (see also _reg version below). + * Emit a paired ALU instruction. 
*/ -static void set_argC(struct r500_pfs_compile_state *cs, int ip, int source, GLuint swizRGB, GLuint swizA) +static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst) { PROG_CODE; - code->inst[ip].inst5 |= - (source << R500_ALU_RGBA_SEL_C_SHIFT) | - MAKE_SWIZ_RGBA_C(swizRGB) | - (source << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT) | - MAKE_SWIZ_ALPHA_C(swizA); -} -/** - * Set an instruction's argument A (both RGB and ALPHA) from the given source, - * taking swizzles, negation and absolute value from the given source register. - */ -static void set_argA_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) -{ - set_argA(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); -} - -/** - * Set an instruction's argument B (both RGB and ALPHA) from the given source, - * taking swizzles, negation and absolute value from the given source register. - */ -static void set_argB_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) -{ - set_argB(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); -} - -/** - * Set an instruction's argument C (both RGB and ALPHA) from the given source, - * taking swizzles, negation and absolute value from the given source register. - */ -static void set_argC_reg(struct r500_pfs_compile_state *cs, int ip, int source, struct prog_src_register srcreg) -{ - set_argC(cs, ip, source, make_rgb_swizzle(srcreg), make_alpha_swizzle(srcreg)); -} - -/** - * Emit a special scalar operation. 
- */ -static int emit_sop(struct r500_pfs_compile_state *cs, - int opcode, struct prog_dst_register dstreg, GLuint src, GLuint swiz) -{ - int ip = emit_alu(cs, R500_ALU_RGBA_OP_SOP, opcode, dstreg); - set_src0_direct(cs, ip, src); - set_argA(cs, ip, 0, R500_SWIZ_RGB_ZERO, swiz); - return ip; -} + if (code->inst_end >= 511) { + error("emit_alu: Too many instructions"); + return GL_FALSE; + } + int ip = ++code->inst_end; -static void do_inst(struct r500_pfs_compile_state *cs, struct prog_instruction *fpi) { - PROG_CODE; - GLuint src[3], dest = 0; - int ip; + code->inst[ip].inst5 = translate_rgb_op(inst->RGB.Opcode); + code->inst[ip].inst4 = translate_alpha_op(inst->Alpha.Opcode); - if (fpi->Opcode != OPCODE_KIL) { - dest = make_dest(cs, fpi->DstReg); - } + if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask) + code->inst[ip].inst0 = R500_INST_TYPE_OUT; + else + code->inst[ip].inst0 = R500_INST_TYPE_ALU; + code->inst[ip].inst0 |= R500_INST_TEX_SEM_WAIT; - switch (fpi->Opcode) { - case OPCODE_ADD: - /* Variation on MAD: 1*src0+src1 */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA(cs, ip, 0, R500_SWIZ_RGB_ONE, R500_SWIZZLE_ONE); - set_argB_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argC_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_CMP: - /* This inst's selects need to be swapped as follows: - * 0 -> C ; 1 -> B ; 2 -> A */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_src2(cs, ip, fpi->SrcReg[2]); - set_argA_reg(cs, ip, 2, fpi->SrcReg[2]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - set_argC_reg(cs, ip, 0, fpi->SrcReg[0]); - break; - case OPCODE_COS: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_COS, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_DP3: - ip 
= emit_alu(cs, R500_ALU_RGBA_OP_DP3, R500_ALPHA_OP_DP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_DP4: - ip = emit_alu(cs, R500_ALU_RGBA_OP_DP4, R500_ALPHA_OP_DP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_EX2: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_EX2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_FRC: - ip = emit_alu(cs, R500_ALU_RGBA_OP_FRC, R500_ALPHA_OP_FRC, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - break; - case OPCODE_LG2: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_LN2, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_MAD: - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_src2(cs, ip, fpi->SrcReg[2]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - set_argC_reg(cs, ip, 2, fpi->SrcReg[2]); - break; - case OPCODE_MAX: - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAX, R500_ALPHA_OP_MAX, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_MIN: - ip = emit_alu(cs, R500_ALU_RGBA_OP_MIN, R500_ALPHA_OP_MIN, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - break; - case OPCODE_MOV: - ip = emit_alu(cs, R500_ALU_RGBA_OP_CMP, R500_ALPHA_OP_CMP, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - 
set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - code->inst[ip].inst3 |= R500_ALU_RGB_OMOD_DISABLE; - code->inst[ip].inst4 |= R500_ALPHA_OMOD_DISABLE; - break; - case OPCODE_MUL: - /* Variation on MAD: src0*src1+0 */ - ip = emit_alu(cs, R500_ALU_RGBA_OP_MAD, R500_ALPHA_OP_MAD, fpi->DstReg); - set_src0(cs, ip, fpi->SrcReg[0]); - set_src1(cs, ip, fpi->SrcReg[1]); - set_argA_reg(cs, ip, 0, fpi->SrcReg[0]); - set_argB_reg(cs, ip, 1, fpi->SrcReg[1]); - set_argC(cs, ip, 0, R500_SWIZ_RGB_ZERO, R500_SWIZZLE_ZERO); - break; - case OPCODE_RCP: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_RCP, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_RSQ: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_RSQ, fpi->DstReg, src[0], - (make_sop_swizzle(fpi->SrcReg[0]) | (R500_SWIZ_MOD_ABS<<3)) & ~(R500_SWIZ_MOD_NEG<<3)); - break; - case OPCODE_SIN: - src[0] = make_src(cs, fpi->SrcReg[0]); - emit_sop(cs, R500_ALPHA_OP_SIN, fpi->DstReg, src[0], make_sop_swizzle(fpi->SrcReg[0])); - break; - case OPCODE_KIL: - case OPCODE_TEX: - case OPCODE_TXB: - case OPCODE_TXP: - emit_tex(cs, fpi, dest); - break; - default: - ERROR("unknown fpi->Opcode %s\n", _mesa_opcode_string(fpi->Opcode)); - break; + code->inst[ip].inst0 |= (inst->RGB.WriteMask << 11) | (inst->Alpha.WriteMask << 14); + code->inst[ip].inst0 |= (inst->RGB.OutputWriteMask << 15) | (inst->Alpha.OutputWriteMask << 18); + if (inst->Alpha.DepthWriteMask) { + code->inst[ip].inst4 |= R500_ALPHA_W_OMASK; + c->fp->writes_depth = GL_TRUE; } - /* Finishing touches */ - if (fpi->SaturateMode == SATURATE_ZERO_ONE) { - code->inst[cs->nrslots-1].inst0 |= R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP; - } -} + code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex); + code->inst[ip].inst5 |= R500_ALU_RGBA_ADDRD(inst->RGB.DestIndex); + use_temporary(code, 
inst->Alpha.DestIndex); + use_temporary(code, inst->RGB.DestIndex); -static GLboolean parse_program(struct r500_pfs_compile_state *cs) -{ - PROG_CODE; - struct prog_instruction* fpi; + if (inst->RGB.Saturate) + code->inst[ip].inst0 |= R500_INST_RGB_CLAMP; + if (inst->Alpha.Saturate) + code->inst[ip].inst0 |= R500_INST_ALPHA_CLAMP; - for(fpi = cs->compiler->program->Instructions; fpi->Opcode != OPCODE_END; ++fpi) { - do_inst(cs, fpi); + code->inst[ip].inst1 |= R500_RGB_ADDR0(use_source(code, inst->RGB.Src[0])); + code->inst[ip].inst1 |= R500_RGB_ADDR1(use_source(code, inst->RGB.Src[1])); + code->inst[ip].inst1 |= R500_RGB_ADDR2(use_source(code, inst->RGB.Src[2])); - if (cs->compiler->fp->error) - return GL_FALSE; - } + code->inst[ip].inst2 |= R500_ALPHA_ADDR0(use_source(code, inst->Alpha.Src[0])); + code->inst[ip].inst2 |= R500_ALPHA_ADDR1(use_source(code, inst->Alpha.Src[1])); + code->inst[ip].inst2 |= R500_ALPHA_ADDR2(use_source(code, inst->Alpha.Src[2])); - /* Finish him! (If it's an ALU/OUT instruction...) */ - if ((code->inst[cs->nrslots-1].inst0 & 0x3) == 1) { - code->inst[cs->nrslots-1].inst0 |= R500_INST_LAST; - } else { - /* We still need to put an output inst, right? 
*/ - WARN_ONCE("Final FP instruction is not an OUT.\n"); - } + code->inst[ip].inst3 |= translate_arg_rgb(inst, 0) << R500_ALU_RGB_SEL_A_SHIFT; + code->inst[ip].inst3 |= translate_arg_rgb(inst, 1) << R500_ALU_RGB_SEL_B_SHIFT; + code->inst[ip].inst5 |= translate_arg_rgb(inst, 2) << R500_ALU_RGBA_SEL_C_SHIFT; - code->max_temp_idx++; + code->inst[ip].inst4 |= translate_arg_alpha(inst, 0) << R500_ALPHA_SEL_A_SHIFT; + code->inst[ip].inst4 |= translate_arg_alpha(inst, 1) << R500_ALPHA_SEL_B_SHIFT; + code->inst[ip].inst5 |= translate_arg_alpha(inst, 2) << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT; return GL_TRUE; } -static void init_program(struct r500_pfs_compile_state *cs) +static GLuint translate_strq_swizzle(struct prog_src_register src) { - PROG_CODE; - struct gl_fragment_program *mp = &cs->compiler->fp->mesa_program; - struct prog_instruction *fpi; - GLuint InputsRead = mp->Base.InputsRead; - GLuint temps_used = 0; + GLuint swiz = 0; int i; + for (i = 0; i < 4; i++) + swiz |= (GET_SWZ(src.Swizzle, i) & 0x3) << i*2; + return swiz; +} - /* New compile, reset tracking data */ - cs->compiler->fp->optimization = - driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization"); - cs->compiler->fp->translated = GL_FALSE; - cs->compiler->fp->error = GL_FALSE; +/** + * Emit a single TEX instruction + */ +static GLboolean emit_tex(void *data, struct prog_instruction *inst) +{ + PROG_CODE; - _mesa_bzero(code, sizeof(*code)); - code->max_temp_idx = 1; /* Size of pixel stack, plus 1. */ - cs->nrslots = 0; - cs->compiler->fp->writes_depth = GL_FALSE; - - /* Work out what temps the Mesa inputs correspond to, this must match - * what setup_rs_unit does, which shouldn't be a problem as rs_unit - * configures itself based on the fragprog's InputsRead - * - * NOTE: this depends on get_hw_temp() allocating registers in order, - * starting from register 0, so we're just going to do that instead. 
- */ - - /* Texcoords come first */ - for (i = 0; i < cs->compiler->fp->ctx->Const.MaxTextureUnits; i++) { - if (InputsRead & (FRAG_BIT_TEX0 << i)) { - cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0; - cs->inputs[FRAG_ATTRIB_TEX0 + i].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - } - InputsRead &= ~FRAG_BITS_TEX_ANY; - - /* fragment position treated as a texcoord */ - if (InputsRead & FRAG_BIT_WPOS) { - cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0; - cs->inputs[FRAG_ATTRIB_WPOS].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - InputsRead &= ~FRAG_BIT_WPOS; - - /* Then primary colour */ - if (InputsRead & FRAG_BIT_COL0) { - cs->inputs[FRAG_ATTRIB_COL0].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL0].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - InputsRead &= ~FRAG_BIT_COL0; - - /* Secondary color */ - if (InputsRead & FRAG_BIT_COL1) { - cs->inputs[FRAG_ATTRIB_COL1].refcount = 0; - cs->inputs[FRAG_ATTRIB_COL1].reg = - code->temp_reg_offset; - code->temp_reg_offset++; - } - InputsRead &= ~FRAG_BIT_COL1; - - /* Anything else */ - if (InputsRead) { - WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead); - /* force read from hwreg 0 for now */ - for (i = 0; i < 32; i++) - if (InputsRead & (1 << i)) - cs->inputs[i].reg = 0; + if (code->inst_end >= 511) { + error("emit_tex: Too many instructions"); + return GL_FALSE; } - int ip; + int ip = ++code->inst_end; - for (ip = 0; ip < cs->compiler->program->NumInstructions; ip++) { - fpi = cs->compiler->program->Instructions + ip; - for (i = 0; i < 3; i++) { - if (fpi->SrcReg[i].File == PROGRAM_TEMPORARY) { - if (fpi->SrcReg[i].Index >= temps_used) - temps_used = fpi->SrcReg[i].Index + 1; - } - } - } + code->inst[ip].inst0 = R500_INST_TYPE_TEX + | (inst->DstReg.WriteMask << 11) + | R500_INST_TEX_SEM_WAIT; + code->inst[ip].inst1 = R500_TEX_ID(inst->TexSrcUnit) + | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED; + if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) + 
code->inst[ip].inst1 |= R500_TEX_UNSCALED; - cs->temp_in_use = temps_used + 1; + switch (inst->Opcode) { + case OPCODE_KIL: + code->inst[ip].inst1 |= R500_TEX_INST_TEXKILL; + break; + case OPCODE_TEX: + code->inst[ip].inst1 |= R500_TEX_INST_LD; + break; + case OPCODE_TXB: + code->inst[ip].inst1 |= R500_TEX_INST_LODBIAS; + break; + case OPCODE_TXP: + code->inst[ip].inst1 |= R500_TEX_INST_PROJ; + break; + default: + error("emit_tex can't handle opcode %x\n", inst->Opcode); + } - code->max_temp_idx = code->temp_reg_offset + cs->temp_in_use; + code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcReg[0].Index) + | (translate_strq_swizzle(inst->SrcReg[0]) << 8) + | R500_TEX_DST_ADDR(inst->DstReg.Index) + | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G + | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A; - if (RADEON_DEBUG & DEBUG_PIXEL) - fprintf(stderr, "FP temp indices: code->max_temp_idx: %d cs->temp_in_use: %d\n", code->max_temp_idx, cs->temp_in_use); + return GL_TRUE; } -static void dumb_shader(struct r500_pfs_compile_state *cs) -{ - PROG_CODE; - code->inst[0].inst0 = R500_INST_TYPE_TEX - | R500_INST_TEX_SEM_WAIT - | R500_INST_RGB_WMASK_R - | R500_INST_RGB_WMASK_G - | R500_INST_RGB_WMASK_B - | R500_INST_ALPHA_WMASK - | R500_INST_RGB_CLAMP - | R500_INST_ALPHA_CLAMP; - code->inst[0].inst1 = R500_TEX_ID(0) - | R500_TEX_INST_LD - | R500_TEX_SEM_ACQUIRE - | R500_TEX_IGNORE_UNCOVERED; - code->inst[0].inst2 = R500_TEX_SRC_ADDR(0) - | R500_TEX_SRC_S_SWIZ_R - | R500_TEX_SRC_T_SWIZ_G - | R500_TEX_DST_ADDR(0) - | R500_TEX_DST_R_SWIZ_R - | R500_TEX_DST_G_SWIZ_G - | R500_TEX_DST_B_SWIZ_B - | R500_TEX_DST_A_SWIZ_A; - code->inst[0].inst3 = R500_DX_ADDR(0) - | R500_DX_S_SWIZ_R - | R500_DX_T_SWIZ_R - | R500_DX_R_SWIZ_R - | R500_DX_Q_SWIZ_R - | R500_DY_ADDR(0) - | R500_DY_S_SWIZ_R - | R500_DY_T_SWIZ_R - | R500_DY_R_SWIZ_R - | R500_DY_Q_SWIZ_R; - code->inst[0].inst4 = 0x0; - code->inst[0].inst5 = 0x0; - - code->inst[1].inst0 = R500_INST_TYPE_OUT | - R500_INST_TEX_SEM_WAIT | - 
R500_INST_LAST | - R500_INST_RGB_OMASK_R | - R500_INST_RGB_OMASK_G | - R500_INST_RGB_OMASK_B | - R500_INST_ALPHA_OMASK; - code->inst[1].inst1 = R500_RGB_ADDR0(0) | - R500_RGB_ADDR1(0) | - R500_RGB_ADDR1_CONST | - R500_RGB_ADDR2(0) | - R500_RGB_ADDR2_CONST | - R500_RGB_SRCP_OP_1_MINUS_2RGB0; - code->inst[1].inst2 = R500_ALPHA_ADDR0(0) | - R500_ALPHA_ADDR1(0) | - R500_ALPHA_ADDR1_CONST | - R500_ALPHA_ADDR2(0) | - R500_ALPHA_ADDR2_CONST | - R500_ALPHA_SRCP_OP_1_MINUS_2A0; - code->inst[1].inst3 = R500_ALU_RGB_SEL_A_SRC0 | - R500_ALU_RGB_R_SWIZ_A_R | - R500_ALU_RGB_G_SWIZ_A_G | - R500_ALU_RGB_B_SWIZ_A_B | - R500_ALU_RGB_SEL_B_SRC0 | - R500_ALU_RGB_R_SWIZ_B_1 | - R500_ALU_RGB_B_SWIZ_B_1 | - R500_ALU_RGB_G_SWIZ_B_1; - code->inst[1].inst4 = R500_ALPHA_OP_MAD | - R500_ALPHA_SWIZ_A_A | - R500_ALPHA_SWIZ_B_1; - code->inst[1].inst5 = R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_R_SWIZ_0 | - R500_ALU_RGBA_G_SWIZ_0 | - R500_ALU_RGBA_B_SWIZ_0 | - R500_ALU_RGBA_A_SWIZ_0; - - cs->nrslots = 2; -} +static const struct radeon_pair_handler pair_handler = { + .EmitConst = emit_const, + .EmitPaired = emit_paired, + .EmitTex = emit_tex, + .MaxHwTemps = 128 +}; GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler) { - struct r500_pfs_compile_state cs; struct r500_fragment_program_code *code = compiler->code; - _mesa_memset(&cs, 0, sizeof(cs)); - cs.compiler = compiler; - init_program(&cs); - - if (!parse_program(&cs)) { -#if 0 - ERROR("Huh. Couldn't parse program. 
There should be additional errors explaining why.\nUsing dumb shader...\n"); - dumb_shader(fp); - code->inst_offset = 0; - code->inst_end = cs.nrslots - 1; -#endif + _mesa_bzero(code, sizeof(*code)); + code->max_temp_idx = 1; + code->inst_offset = 0; + code->inst_end = -1; + + if (!radeonPairProgram(compiler->r300->radeon.glCtx, compiler->program, &pair_handler, compiler)) return GL_FALSE; - } - code->inst_offset = 0; - code->inst_end = cs.nrslots - 1; + if (code->inst_end < 0 || (code->inst[code->inst_end].inst0 & R500_INST_TYPE_MASK) != R500_INST_TYPE_OUT) { + /* This may happen when dead-code elimination is disabled or + * when most of the fragment program logic is leading to a KIL (inst_end < 0: nothing was emitted at all, so inst[] must not be indexed before the fake OUT is added) */ + if (code->inst_end >= 511) { + error("Introducing fake OUT: Too many instructions"); + return GL_FALSE; + } + + int ip = ++code->inst_end; + code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT; + } return GL_TRUE; } diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/radeon_program.h index ba76bc47cfb..2e01dd496b3 100644 --- a/src/mesa/drivers/dri/r300/radeon_program.h +++ b/src/mesa/drivers/dri/r300/radeon_program.h @@ -45,6 +45,10 @@ enum { PROGRAM_BUILTIN = PROGRAM_FILE_MAX /**< not a real register, but a special swizzle constant */ }; +enum { + OPCODE_REPL_ALPHA = MAX_OPCODE /**< used in paired instructions */ +}; + #define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO) #define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE) diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c new file mode 100644 index 00000000000..86180edcb5e --- /dev/null +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c @@ -0,0 +1,970 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/** + * @file + * + * Perform temporary register allocation and attempt to pair off instructions + * in RGB and Alpha pairs. Also attempts to optimize the TEX instruction + * vs. ALU instruction scheduling. + */ + +#include "radeon_program_pair.h" + +#include "radeon_context.h" + +#include "shader/prog_print.h" + +#define error(fmt, args...) 
do { \ + _mesa_problem(s->Ctx, "%s::%s(): " fmt "\n", \ + __FILE__, __FUNCTION__, ##args); \ + s->Error = GL_TRUE; \ +} while(0) + +struct pair_state_instruction { + GLuint IsTex:1; /**< Is a texture instruction */ + GLuint NeedRGB:1; /**< Needs the RGB ALU */ + GLuint NeedAlpha:1; /**< Needs the Alpha ALU */ + GLuint IsTranscendent:1; /**< Is a special transcendent instruction */ + + /** + * Number of (read and write) dependencies that must be resolved before + * this instruction can be scheduled. + */ + GLuint NumDependencies:5; + + /** + * Next instruction in the linked list of ready instructions. + */ + struct pair_state_instruction *NextReady; + + /** + * Values that this instruction writes + */ + struct reg_value *Values[4]; +}; + + +/** + * Used to keep track of which instructions read a value. + */ +struct reg_value_reader { + GLuint IP; /**< IP of the instruction that performs this access */ + struct reg_value_reader *Next; +}; + +/** + * Used to keep track which values are stored in each component of a + * PROGRAM_TEMPORARY. + */ +struct reg_value { + GLuint IP; /**< IP of the instruction that writes this value */ + struct reg_value *Next; /**< Pointer to the next value to be written to the same PROGRAM_TEMPORARY component */ + + /** + * Unordered linked list of instructions that read from this value. + */ + struct reg_value_reader *Readers; + + /** + * Number of readers of this value. This is calculated during @ref scan_instructions + * and continually decremented during code emission. + * When this count reaches zero, the instruction that writes the @ref Next value + * can be scheduled. + */ + GLuint NumReaders; +}; + +/** + * Used to translate a PROGRAM_INPUT or PROGRAM_TEMPORARY Mesa register + * to the proper hardware temporary. 
+ */ +struct pair_register_translation { + GLuint Allocated:1; + GLuint HwIndex:8; + GLuint RefCount:23; /**< # of times this occurs in an unscheduled instruction SrcReg or DstReg */ + + /** + * Notes the value that is currently contained in each component + * (only used for PROGRAM_TEMPORARY registers). + */ + struct reg_value *Value[4]; +}; + +struct pair_state { + GLcontext *Ctx; + struct gl_program *Program; + const struct radeon_pair_handler *Handler; + GLboolean Error; + GLboolean Debug; + GLboolean Verbose; + void *UserData; + + /** + * Translate Mesa registers to hardware registers + */ + struct pair_register_translation Inputs[FRAG_ATTRIB_MAX]; + struct pair_register_translation Temps[MAX_PROGRAM_TEMPS]; + + /** + * Derived information about program instructions. + */ + struct pair_state_instruction *Instructions; + + struct { + GLuint RefCount; /**< # of times this occurs in an unscheduled SrcReg or DstReg */ + } HwTemps[128]; + + /** + * Linked list of instructions that can be scheduled right now, + * based on which ALU/TEX resources they require. + */ + struct pair_state_instruction *ReadyFullALU; + struct pair_state_instruction *ReadyRGB; + struct pair_state_instruction *ReadyAlpha; + struct pair_state_instruction *ReadyTEX; + + /** + * Pool of @ref reg_value structures for fast allocation. 
+ */ + struct reg_value *ValuePool; + GLuint ValuePoolUsed; + struct reg_value_reader *ReaderPool; + GLuint ReaderPoolUsed; +}; + + +static struct pair_register_translation *get_register(struct pair_state *s, GLuint file, GLuint index) +{ + switch(file) { + case PROGRAM_TEMPORARY: return &s->Temps[index]; + case PROGRAM_INPUT: return &s->Inputs[index]; + default: return 0; + } +} + + +static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index) +{ + GLuint hwindex; + + struct pair_register_translation *t = get_register(s, file, index); + if (!t) { + _mesa_problem(s->Ctx, "get_hw_reg: %i[%i]\n", file, index); + return 0; + } + + if (t->Allocated) + return t->HwIndex; + + for(hwindex = 0; hwindex < s->Handler->MaxHwTemps; ++hwindex) + if (!s->HwTemps[hwindex].RefCount) + break; + + if (hwindex >= s->Handler->MaxHwTemps) { + error("Ran out of hardware temporaries"); + return 0; + } + + s->HwTemps[hwindex].RefCount = t->RefCount; + t->Allocated = 1; + t->HwIndex = hwindex; + return hwindex; +} + + +static void deref_hw_reg(struct pair_state *s, GLuint hwindex) +{ + if (!s->HwTemps[hwindex].RefCount) { + error("Hwindex %i refcount error", hwindex); + return; + } + + s->HwTemps[hwindex].RefCount--; +} + +static void add_pairinst_to_list(struct pair_state_instruction **list, struct pair_state_instruction *pairinst) +{ + pairinst->NextReady = *list; + *list = pairinst; +} + +/** + * The instruction at the given IP has become ready. Link it into the ready + * instructions. 
+ */ +static void instruction_ready(struct pair_state *s, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + + if (s->Verbose) + _mesa_printf("instruction_ready(%i)\n", ip); + + if (pairinst->IsTex) + add_pairinst_to_list(&s->ReadyTEX, pairinst); + else if (!pairinst->NeedAlpha) + add_pairinst_to_list(&s->ReadyRGB, pairinst); + else if (!pairinst->NeedRGB) + add_pairinst_to_list(&s->ReadyAlpha, pairinst); + else + add_pairinst_to_list(&s->ReadyFullALU, pairinst); +} + + +/** + * Finally rewrite ADD, MOV, MUL as the appropriate native instruction + * and reverse the order of arguments for CMP. + */ +static void final_rewrite(struct pair_state *s, struct prog_instruction *inst) +{ + struct prog_src_register tmp; + + switch(inst->Opcode) { + case OPCODE_ADD: + inst->SrcReg[2] = inst->SrcReg[1]; + inst->SrcReg[1].File = PROGRAM_BUILTIN; + inst->SrcReg[1].Swizzle = SWIZZLE_1111; + inst->SrcReg[1].NegateBase = 0; + inst->SrcReg[1].NegateAbs = 0; + inst->Opcode = OPCODE_MAD; + break; + case OPCODE_CMP: + tmp = inst->SrcReg[2]; + inst->SrcReg[2] = inst->SrcReg[0]; + inst->SrcReg[0] = tmp; + break; + case OPCODE_MOV: + inst->SrcReg[1] = inst->SrcReg[0]; + inst->SrcReg[2].File = PROGRAM_BUILTIN; + inst->SrcReg[2].Swizzle = SWIZZLE_0000; + inst->Opcode = OPCODE_CMP; + // TODO: disable output modifiers on R500 + break; + case OPCODE_MUL: + inst->SrcReg[2].File = PROGRAM_BUILTIN; + inst->SrcReg[2].Swizzle = SWIZZLE_0000; + inst->Opcode = OPCODE_MAD; + break; + default: + /* nothing to do */ + break; + } +} + + +/** + * Classify an instruction according to which ALUs etc. it needs + */ +static void classify_instruction(struct pair_state *s, + struct prog_instruction *inst, struct pair_state_instruction *pairinst) +{ + pairinst->NeedRGB = (inst->DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0; + pairinst->NeedAlpha = (inst->DstReg.WriteMask & WRITEMASK_W) ? 
1 : 0; + + switch(inst->Opcode) { + case OPCODE_ADD: + case OPCODE_CMP: + case OPCODE_FRC: + case OPCODE_MAD: + case OPCODE_MAX: + case OPCODE_MIN: + case OPCODE_MOV: + case OPCODE_MUL: + break; + case OPCODE_COS: + case OPCODE_EX2: + case OPCODE_LG2: + case OPCODE_RCP: + case OPCODE_RSQ: + case OPCODE_SIN: + pairinst->IsTranscendent = 1; + pairinst->NeedAlpha = 1; + break; + case OPCODE_DP4: + pairinst->NeedAlpha = 1; + /* fall through */ + case OPCODE_DP3: + pairinst->NeedRGB = 1; + break; + case OPCODE_KIL: + case OPCODE_TEX: + case OPCODE_TXB: + case OPCODE_TXP: + case OPCODE_END: + pairinst->IsTex = 1; + break; + default: + error("Unknown opcode %d\n", inst->Opcode); + break; + } +} + + +/** + * Count which (input, temporary) register is read and written how often, + * and scan the instruction stream to find dependencies. + */ +static void scan_instructions(struct pair_state *s) +{ + struct prog_instruction *inst; + struct pair_state_instruction *pairinst; + GLuint ip; + + for(inst = s->Program->Instructions, pairinst = s->Instructions, ip = 0; + inst->Opcode != OPCODE_END; + ++inst, ++pairinst, ++ip) { + final_rewrite(s, inst); + classify_instruction(s, inst, pairinst); + + int nsrc = _mesa_num_inst_src_regs(inst->Opcode); + int j; + for(j = 0; j < nsrc; j++) { + struct pair_register_translation *t = + get_register(s, inst->SrcReg[j].File, inst->SrcReg[j].Index); + if (!t) + continue; + + t->RefCount++; + + if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) { + int i; + for(i = 0; i < 4; ++i) { + GLuint swz = GET_SWZ(inst->SrcReg[j].Swizzle, i); + if (swz >= 4) + continue; /* constant or NIL swizzle */ + if (!t->Value[swz]) + continue; /* this is an undefined read */ + + /* Do not add a dependency if this instruction + * also rewrites the value. The code below adds + * a dependency for the DstReg, which is a superset + * of the SrcReg dependency. 
*/ + if (inst->DstReg.File == PROGRAM_TEMPORARY && + inst->DstReg.Index == inst->SrcReg[j].Index && + GET_BIT(inst->DstReg.WriteMask, swz)) + continue; + + struct reg_value_reader* r = &s->ReaderPool[s->ReaderPoolUsed++]; + pairinst->NumDependencies++; + t->Value[swz]->NumReaders++; + r->IP = ip; + r->Next = t->Value[swz]->Readers; + t->Value[swz]->Readers = r; + } + } + } + + int ndst = _mesa_num_inst_dst_regs(inst->Opcode); + if (ndst) { + struct pair_register_translation *t = + get_register(s, inst->DstReg.File, inst->DstReg.Index); + if (t) { + t->RefCount++; + + if (inst->DstReg.File == PROGRAM_TEMPORARY) { + int j; + for(j = 0; j < 4; ++j) { + if (!GET_BIT(inst->DstReg.WriteMask, j)) + continue; + + struct reg_value* v = &s->ValuePool[s->ValuePoolUsed++]; + v->IP = ip; + if (t->Value[j]) { + pairinst->NumDependencies++; + t->Value[j]->Next = v; + } + t->Value[j] = v; + pairinst->Values[j] = v; + } + } + } + } + + if (s->Verbose) + _mesa_printf("scan(%i): NumDeps = %i\n", ip, pairinst->NumDependencies); + + if (!pairinst->NumDependencies) + instruction_ready(s, ip); + } + + /* Clear the PROGRAM_TEMPORARY state */ + int i, j; + for(i = 0; i < MAX_PROGRAM_TEMPS; ++i) { + for(j = 0; j < 4; ++j) + s->Temps[i].Value[j] = 0; + } +} + + +/** + * Reserve hardware temporary registers for the program inputs. + * + * @note This allocation is performed explicitly, because the order of inputs + * is determined by the RS hardware. 
+ */ +static void allocate_input_registers(struct pair_state *s) +{ + GLuint InputsRead = s->Program->InputsRead; + int i; + + /* Texcoords come first */ + for (i = 0; i < s->Ctx->Const.MaxTextureUnits; i++) { + if (InputsRead & (FRAG_BIT_TEX0 << i)) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i); + } + InputsRead &= ~FRAG_BITS_TEX_ANY; + + /* fragment position treated as a texcoord */ + if (InputsRead & FRAG_BIT_WPOS) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS); + InputsRead &= ~FRAG_BIT_WPOS; + + /* Then primary colour */ + if (InputsRead & FRAG_BIT_COL0) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0); + InputsRead &= ~FRAG_BIT_COL0; + + /* Secondary color */ + if (InputsRead & FRAG_BIT_COL1) + get_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1); + InputsRead &= ~FRAG_BIT_COL1; + + /* Anything else */ + if (InputsRead) + error("Don't know how to handle inputs 0x%x\n", InputsRead); +} + + +static void decrement_dependencies(struct pair_state *s, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + ASSERT(pairinst->NumDependencies > 0); + if (!--pairinst->NumDependencies) + instruction_ready(s, ip); +} + +/** + * Update the dependency tracking state based on what the instruction + * at the given IP does. 
+ */ +static void commit_instruction(struct pair_state *s, int ip) +{ + struct prog_instruction *inst = s->Program->Instructions + ip; + struct pair_state_instruction *pairinst = s->Instructions + ip; + + if (s->Verbose) + _mesa_printf("commit_instruction(%i)\n", ip); + + if (inst->DstReg.File == PROGRAM_TEMPORARY) { + struct pair_register_translation *t = &s->Temps[inst->DstReg.Index]; + deref_hw_reg(s, t->HwIndex); + + int i; + for(i = 0; i < 4; ++i) { + if (!GET_BIT(inst->DstReg.WriteMask, i)) + continue; + + t->Value[i] = pairinst->Values[i]; + if (t->Value[i]->NumReaders) { + struct reg_value_reader *r; + for(r = pairinst->Values[i]->Readers; r; r = r->Next) + decrement_dependencies(s, r->IP); + } else if (t->Value[i]->Next) { + /* This happens when the only reader writes + * the register at the same time */ + decrement_dependencies(s, t->Value[i]->Next->IP); + } + } + } + + int nsrc = _mesa_num_inst_src_regs(inst->Opcode); + int i; + for(i = 0; i < nsrc; i++) { + struct pair_register_translation *t = get_register(s, inst->SrcReg[i].File, inst->SrcReg[i].Index); + if (!t) + continue; + + deref_hw_reg(s, get_hw_reg(s, inst->SrcReg[i].File, inst->SrcReg[i].Index)); + + if (inst->SrcReg[i].File != PROGRAM_TEMPORARY) + continue; + + int j; + for(j = 0; j < 4; ++j) { + GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j); + if (swz >= 4) + continue; + if (!t->Value[swz]) + continue; + + /* Do not free a dependency if this instruction + * also rewrites the value. See scan_instructions. */ + if (inst->DstReg.File == PROGRAM_TEMPORARY && + inst->DstReg.Index == inst->SrcReg[i].Index && + GET_BIT(inst->DstReg.WriteMask, swz)) + continue; + + if (!--t->Value[swz]->NumReaders) { + if (t->Value[swz]->Next) + decrement_dependencies(s, t->Value[swz]->Next->IP); + } + } + } +} + + +/** + * Emit all ready texture instructions in a single block. + * + * Emit as a single block to (hopefully) sample many textures in parallel, + * and to avoid hardware indirections on R300. 
+ * + * In R500, we don't really know when the result of a texture instruction + * arrives. So allocate all destinations first, to make sure they do not + * arrive early and overwrite a texture coordinate we're going to use later + * in the block. + */ +static void emit_all_tex(struct pair_state *s) +{ + struct pair_state_instruction *readytex; + struct pair_state_instruction *pairinst; + + ASSERT(s->ReadyTEX); + + // Don't let the ready list change under us! + readytex = s->ReadyTEX; + s->ReadyTEX = 0; + + // Allocate destination hardware registers in one block to avoid conflicts. + for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) { + int ip = pairinst - s->Instructions; + struct prog_instruction *inst = s->Program->Instructions + ip; + if (inst->Opcode != OPCODE_KIL) + get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index); + } + + if (s->Debug) + _mesa_printf(" BEGIN_TEX\n"); + + for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) { + int ip = pairinst - s->Instructions; + struct prog_instruction *inst = s->Program->Instructions + ip; + commit_instruction(s, ip); + + if (inst->Opcode != OPCODE_KIL) + inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index); + inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index); + + if (s->Debug) { + _mesa_printf(" "); + _mesa_print_instruction(inst); + } + s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst); + } + + if (s->Handler->EndTexBlock) + s->Handler->EndTexBlock(s->UserData); + + if (s->Debug) + _mesa_printf(" END_TEX\n"); +} + + +static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instruction *pair, + struct prog_src_register src, GLboolean rgb, GLboolean alpha) +{ + int candidate = -1; + int candidate_quality = -1; + int i; + + if (!rgb && !alpha) + return 0; + + GLuint constant; + GLuint index; + + if (src.File == PROGRAM_TEMPORARY || src.File == PROGRAM_INPUT) { + constant = 0; + index = get_hw_reg(s, src.File, 
src.Index); + } else { + constant = 1; + s->Error |= !s->Handler->EmitConst(s->UserData, src.File, src.Index, &index); + } + + for(i = 0; i < 3; ++i) { + int q = 0; + if (rgb) { + if (pair->RGB.Src[i].Used) { + if (pair->RGB.Src[i].Constant != constant || + pair->RGB.Src[i].Index != index) + continue; + q++; + } + } + if (alpha) { + if (pair->Alpha.Src[i].Used) { + if (pair->Alpha.Src[i].Constant != constant || + pair->Alpha.Src[i].Index != index) + continue; + q++; + } + } + if (q > candidate_quality) { + candidate_quality = q; + candidate = i; + } + } + + if (candidate >= 0) { + if (rgb) { + pair->RGB.Src[candidate].Used = 1; + pair->RGB.Src[candidate].Constant = constant; + pair->RGB.Src[candidate].Index = index; + } + if (alpha) { + pair->Alpha.Src[candidate].Used = 1; + pair->Alpha.Src[candidate].Constant = constant; + pair->Alpha.Src[candidate].Index = index; + } + } + + return candidate; +} + + + +/** + * Fill the given ALU instruction's opcodes and source operands into the given pair, + * if possible. 
+ */ +static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + struct prog_instruction *inst = s->Program->Instructions + ip; + + ASSERT(!pairinst->NeedRGB || pair->RGB.Opcode == OPCODE_NOP); + ASSERT(!pairinst->NeedAlpha || pair->Alpha.Opcode == OPCODE_NOP); + + if (pairinst->NeedRGB) { + if (pairinst->IsTranscendent) + pair->RGB.Opcode = OPCODE_REPL_ALPHA; + else + pair->RGB.Opcode = inst->Opcode; + } + if (pairinst->NeedAlpha) + pair->Alpha.Opcode = inst->Opcode; + + int nargs = _mesa_num_inst_src_regs(inst->Opcode); + int i; + + for(i = 0; i < nargs; ++i) { + int source; + if (pairinst->NeedRGB && !pairinst->IsTranscendent) { + GLboolean srcrgb = GL_FALSE; + GLboolean srcalpha = GL_FALSE; + GLuint negatebase = 0; + int j; + for(j = 0; j < 3; ++j) { + GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j); + if (swz < 3) + srcrgb = GL_TRUE; + else if (swz < 4) + srcalpha = GL_TRUE; + if (swz != SWIZZLE_NIL && GET_BIT(inst->SrcReg[i].NegateBase, j)) + negatebase = 1; + } + source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha); + if (source < 0) + return GL_FALSE; + pair->RGB.Arg[i].Source = source; + pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff; + pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs; + pair->RGB.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs; + } + if (pairinst->NeedAlpha) { + GLboolean srcrgb = GL_FALSE; + GLboolean srcalpha = GL_FALSE; + GLuint negatebase = GET_BIT(inst->SrcReg[i].NegateBase, pairinst->IsTranscendent ? 0 : 3); + GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 
0 : 3); + if (swz < 3) + srcrgb = GL_TRUE; + else if (swz < 4) + srcalpha = GL_TRUE; + source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha); + if (source < 0) + return GL_FALSE; + pair->Alpha.Arg[i].Source = source; + pair->Alpha.Arg[i].Swizzle = swz; + pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs; + pair->Alpha.Arg[i].Negate = (negatebase & ~pair->Alpha.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs; /* mask with the alpha operand's own Abs flag: RGB.Arg[i] is unset for alpha-only instructions and belongs to the other instruction in an RGB/Alpha pairing */ + } + } + + return GL_TRUE; +} + + +/** + * Fill in the destination register information. + * + * This is split from filling in source registers because we want + * to avoid allocating hardware temporaries for destinations until + * we are absolutely certain that we're going to emit a certain + * instruction pairing. + */ +static void fill_dest_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip) +{ + struct pair_state_instruction *pairinst = s->Instructions + ip; + struct prog_instruction *inst = s->Program->Instructions + ip; + + if (inst->DstReg.File == PROGRAM_OUTPUT) { + if (inst->DstReg.Index == FRAG_RESULT_COLR) { + pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ; + pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3); + } else if (inst->DstReg.Index == FRAG_RESULT_DEPR) { + pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3); + } + } else { + GLuint hwindex = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index); + if (pairinst->NeedRGB) { + pair->RGB.DestIndex = hwindex; + pair->RGB.WriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ; + } + if (pairinst->NeedAlpha) { + pair->Alpha.DestIndex = hwindex; + pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3); + } + } +} + + +/** + * Find a good ALU instruction or pair of ALU instruction and emit it. + * + * Prefer emitting full ALU instructions, so that when we reach a point + * where no full ALU instruction can be emitted, we have more candidates + * for RGB/Alpha pairing. 
+ */ +static void emit_alu(struct pair_state *s) +{ + struct radeon_pair_instruction pair; + + if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) { + int ip; + if (s->ReadyFullALU) { + ip = s->ReadyFullALU - s->Instructions; + s->ReadyFullALU = s->ReadyFullALU->NextReady; + } else if (s->ReadyRGB) { + ip = s->ReadyRGB - s->Instructions; + s->ReadyRGB = s->ReadyRGB->NextReady; + } else { + ip = s->ReadyAlpha - s->Instructions; + s->ReadyAlpha = s->ReadyAlpha->NextReady; + } + + _mesa_bzero(&pair, sizeof(pair)); + fill_instruction_into_pair(s, &pair, ip); + fill_dest_into_pair(s, &pair, ip); + commit_instruction(s, ip); + } else { + struct pair_state_instruction **prgb; + struct pair_state_instruction **palpha; + + /* Some pairings might fail because they require too + * many source slots; try all possible pairings if necessary */ + for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) { + for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) { + int rgbip = *prgb - s->Instructions; + int alphaip = *palpha - s->Instructions; + _mesa_bzero(&pair, sizeof(pair)); + fill_instruction_into_pair(s, &pair, rgbip); + if (!fill_instruction_into_pair(s, &pair, alphaip)) + continue; + *prgb = (*prgb)->NextReady; + *palpha = (*palpha)->NextReady; + fill_dest_into_pair(s, &pair, rgbip); + fill_dest_into_pair(s, &pair, alphaip); + commit_instruction(s, rgbip); + commit_instruction(s, alphaip); + goto success; + } + } + + /* No success in pairing; just take the first RGB instruction */ + int ip = s->ReadyRGB - s->Instructions; + s->ReadyRGB = s->ReadyRGB->NextReady; + _mesa_bzero(&pair, sizeof(pair)); + fill_instruction_into_pair(s, &pair, ip); + fill_dest_into_pair(s, &pair, ip); + commit_instruction(s, ip); + success: ; + } + + if (s->Debug) + radeonPrintPairInstruction(&pair); + + s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair); +} + + +GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program, + const struct 
radeon_pair_handler* handler, void *userdata) +{ + struct pair_state s; + + _mesa_bzero(&s, sizeof(s)); + s.Ctx = ctx; + s.Program = program; + s.Handler = handler; + s.UserData = userdata; + s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE; + s.Verbose = GL_FALSE && s.Debug; + + s.Instructions = (struct pair_state_instruction*)_mesa_calloc( + sizeof(struct pair_state_instruction)*s.Program->NumInstructions); + s.ValuePool = (struct reg_value*)_mesa_calloc(sizeof(struct reg_value)*s.Program->NumInstructions*4); + s.ReaderPool = (struct reg_value_reader*)_mesa_calloc( + sizeof(struct reg_value_reader)*s.Program->NumInstructions*12); + + if (s.Debug) + _mesa_printf("Emit paired program\n"); + + scan_instructions(&s); + allocate_input_registers(&s); + + while(!s.Error && + (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) { + if (s.ReadyTEX) + emit_all_tex(&s); + + while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha) + emit_alu(&s); + } + + if (s.Debug) + _mesa_printf(" END\n"); + + _mesa_free(s.Instructions); + _mesa_free(s.ValuePool); + _mesa_free(s.ReaderPool); + + return !s.Error; +} + + +static void print_pair_src(int i, struct radeon_pair_instruction_source* src) +{ + _mesa_printf(" Src%i = %s[%i]", i, src->Constant ? 
"CNST" : "TEMP", src->Index); +} + +static const char* opcode_string(GLuint opcode) +{ + if (opcode == OPCODE_REPL_ALPHA) + return "SOP"; + else + return _mesa_opcode_string(opcode); +} + +static int num_pairinst_args(GLuint opcode) +{ + if (opcode == OPCODE_REPL_ALPHA) + return 0; + else + return _mesa_num_inst_src_regs(opcode); +} + +static char swizzle_char(GLuint swz) +{ + switch(swz) { + case SWIZZLE_X: return 'x'; + case SWIZZLE_Y: return 'y'; + case SWIZZLE_Z: return 'z'; + case SWIZZLE_W: return 'w'; + case SWIZZLE_ZERO: return '0'; + case SWIZZLE_ONE: return '1'; + case SWIZZLE_NIL: return '_'; + default: return '?'; + } +} + +void radeonPrintPairInstruction(struct radeon_pair_instruction *inst) +{ + int nargs; + int i; + + _mesa_printf(" RGB: "); + for(i = 0; i < 3; ++i) { + if (inst->RGB.Src[i].Used) + print_pair_src(i, inst->RGB.Src + i); + } + _mesa_printf("\n"); + _mesa_printf(" Alpha:"); + for(i = 0; i < 3; ++i) { + if (inst->Alpha.Src[i].Used) + print_pair_src(i, inst->Alpha.Src + i); + } + _mesa_printf("\n"); + + _mesa_printf(" %s%s", opcode_string(inst->RGB.Opcode), inst->RGB.Saturate ? "_SAT" : ""); + if (inst->RGB.WriteMask) + _mesa_printf(" TEMP[%i].%s%s%s", inst->RGB.DestIndex, + (inst->RGB.WriteMask & 1) ? "x" : "", + (inst->RGB.WriteMask & 2) ? "y" : "", + (inst->RGB.WriteMask & 4) ? "z" : ""); + if (inst->RGB.OutputWriteMask) + _mesa_printf(" COLOR.%s%s%s", + (inst->RGB.OutputWriteMask & 1) ? "x" : "", + (inst->RGB.OutputWriteMask & 2) ? "y" : "", + (inst->RGB.OutputWriteMask & 4) ? "z" : ""); + nargs = num_pairinst_args(inst->RGB.Opcode); + for(i = 0; i < nargs; ++i) { + const char* abs = inst->RGB.Arg[i].Abs ? "|" : ""; + const char* neg = inst->RGB.Arg[i].Negate ? 
"-" : ""; + _mesa_printf(", %s%sSrc%i.%c%c%c%s", neg, abs, inst->RGB.Arg[i].Source, + swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 0)), + swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 1)), + swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 2)), + abs); + } + _mesa_printf("\n"); + + _mesa_printf(" %s%s", opcode_string(inst->Alpha.Opcode), inst->Alpha.Saturate ? "_SAT" : ""); + if (inst->Alpha.WriteMask) + _mesa_printf(" TEMP[%i].w", inst->Alpha.DestIndex); + if (inst->Alpha.OutputWriteMask) + _mesa_printf(" COLOR.w"); + if (inst->Alpha.DepthWriteMask) + _mesa_printf(" DEPTH.w"); + nargs = num_pairinst_args(inst->Alpha.Opcode); + for(i = 0; i < nargs; ++i) { + const char* abs = inst->Alpha.Arg[i].Abs ? "|" : ""; + const char* neg = inst->Alpha.Arg[i].Negate ? "-" : ""; + _mesa_printf(", %s%sSrc%i.%c%s", neg, abs, inst->Alpha.Arg[i].Source, + swizzle_char(inst->Alpha.Arg[i].Swizzle), abs); + } + _mesa_printf("\n"); +} diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.h b/src/mesa/drivers/dri/r300/radeon_program_pair.h new file mode 100644 index 00000000000..b2bdd08d27c --- /dev/null +++ b/src/mesa/drivers/dri/r300/radeon_program_pair.h @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2008 Nicolai Haehnle. + * + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_PAIR_H_
+#define __RADEON_PROGRAM_PAIR_H_
+
+#include "radeon_program.h"
+
+
+/**
+ * Data structures describing a paired instruction, as found in R300
+ * and R500 fragment programs.
+ *
+ * radeon_pair_instruction_source is one source slot of either half of
+ * such an instruction. Each half carries three of these slots (Src[3]).
+ */
+struct radeon_pair_instruction_source {
+	GLuint Index:8;		/* register index of this source */
+	GLuint Constant:1;	/* NOTE(review): presumably set when Index names a constant rather than a temporary; confirm */
+	GLuint Used:1;		/* slot is in use; radeonPrintPairInstruction() skips slots where this is 0 */
+};
+/**
+ * RGB half of a paired instruction.
+ *
+ * As decoded by radeonPrintPairInstruction(): WriteMask bits 1, 2 and 4
+ * select the x, y and z channels of TEMP[DestIndex], and
+ * OutputWriteMask uses the same bit layout for the COLOR output.
+ */
+struct radeon_pair_instruction_rgb {
+	GLuint Opcode:8;
+	GLuint DestIndex:8;
+	GLuint WriteMask:3;
+	GLuint OutputWriteMask:3;
+	GLuint Saturate:1;	/* printed as a _SAT suffix on the opcode */
+
+	struct radeon_pair_instruction_source Src[3];
+
+	struct {
+		GLuint Source:2;	/* printed as SrcN; NOTE(review): presumably an index into Src[], confirm */
+		GLuint Swizzle:9;	/* read with GET_SWZ(Swizzle, 0..2) by the printer, i.e. presumably three 3-bit selectors */
+		GLuint Abs:1;		/* printed as |...| around the argument */
+		GLuint Negate:1;	/* printed as a leading minus sign */
+	} Arg[3];
+};
+/**
+ * Alpha half of a paired instruction.
+ *
+ * As decoded by radeonPrintPairInstruction(): WriteMask selects
+ * TEMP[DestIndex].w, OutputWriteMask selects COLOR.w and DepthWriteMask
+ * selects DEPTH.w. Each argument carries a single swizzle selector.
+ */
+struct radeon_pair_instruction_alpha {
+	GLuint Opcode:8;
+	GLuint DestIndex:8;
+	GLuint WriteMask:1;
+	GLuint OutputWriteMask:1;
+	GLuint DepthWriteMask:1;
+	GLuint Saturate:1;	/* printed as a _SAT suffix on the opcode */
+
+	struct radeon_pair_instruction_source Src[3];
+
+	struct {
+		GLuint Source:2;	/* printed as SrcN; NOTE(review): presumably an index into Src[], confirm */
+		GLuint Swizzle:3;	/* one selector, passed unmodified to the swizzle printer */
+		GLuint Abs:1;		/* printed as |...| around the argument */
+		GLuint Negate:1;	/* printed as a leading minus sign */
+	} Arg[3];
+};
+/**
+ * One complete paired instruction: an RGB operation together with an
+ * alpha operation.
+ */
+struct radeon_pair_instruction {
+	struct radeon_pair_instruction_rgb RGB;
+	struct radeon_pair_instruction_alpha Alpha;
+};
+
+
+/**
+ * Set of callbacks (plus MaxHwTemps) handed to radeonPairProgram().
+ *
+ * NOTE(review): the void* first argument of every callback is
+ * presumably the userdata pointer given to radeonPairProgram();
+ * confirm against the implementation.
+ */
+struct radeon_pair_handler {
+	/**
+	 * Fill in the proper hardware index for the given constant register.
+	 *
+	 * @return GL_FALSE on error.
+	 */
+	GLboolean (*EmitConst)(void*, GLuint file, GLuint index, GLuint *hwindex);
+
+	/**
+	 * Write a paired instruction to the hardware.
+	 *
+	 * @return GL_FALSE on error.
+	 */
+	GLboolean (*EmitPaired)(void*, struct radeon_pair_instruction*);
+
+	/**
+	 * Write a texture instruction to the hardware.
+	 * Register indices have already been rewritten to the allocated
+	 * hardware register numbers.
+	 *
+	 * @return GL_FALSE on error.
+	 */
+	GLboolean (*EmitTex)(void*, struct prog_instruction*);
+
+	/**
+	 * Called after a block of contiguous, independent texture
+	 * instructions has been emitted.
+	 */
+	void (*EndTexBlock)(void*);
+	/**
+	 * NOTE(review): presumably the number of hardware temporary
+	 * registers the pairing pass may allocate; confirm.
+	 */
+	GLuint MaxHwTemps;
+};
+/**
+ * Run the pairing pass over the given program, reporting through the
+ * callbacks of the handler; userdata is an opaque caller pointer.
+ *
+ * NOTE(review): the implementation is not visible from this header.
+ * The return value presumably follows the GL_FALSE-on-error convention
+ * documented for the callbacks; confirm.
+ */
+GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
+	const struct radeon_pair_handler*, void *userdata);
+/**
+ * Debugging aid: dump one paired instruction through _mesa_printf(),
+ * listing the used source slots of both halves followed by the RGB and
+ * the alpha operation.
+ */
+void radeonPrintPairInstruction(struct radeon_pair_instruction *inst);
+
+#endif /* __RADEON_PROGRAM_PAIR_H_ */