X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_vs_emit.c;h=1638ef81115910bcbebd8524e5446e13b3f3a7ed;hb=0f68236a2487dbeb0396b996debcda595b0b54a1;hp=108e19cdbcff84ec28a0bc9775bf24fed6e08c7e;hpb=9053bb0d405db0b1a2a917317c7ec41e63b54f09;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c index 108e19cdbcf..326bb1e562f 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c @@ -31,12 +31,51 @@ #include "main/macros.h" -#include "shader/program.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" +#include "program/program.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" #include "brw_context.h" #include "brw_vs.h" +/* Return the SrcReg index of the channels that can be immediate float operands + * instead of usage of PROGRAM_CONSTANT values through push/pull. + */ +static GLboolean +brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg) +{ + int opcode_array[] = { + [OPCODE_MOV] = 1, + [OPCODE_ADD] = 2, + [OPCODE_CMP] = 3, + [OPCODE_DP2] = 2, + [OPCODE_DP3] = 2, + [OPCODE_DP4] = 2, + [OPCODE_DPH] = 2, + [OPCODE_MAX] = 2, + [OPCODE_MIN] = 2, + [OPCODE_MUL] = 2, + [OPCODE_SEQ] = 2, + [OPCODE_SGE] = 2, + [OPCODE_SGT] = 2, + [OPCODE_SLE] = 2, + [OPCODE_SLT] = 2, + [OPCODE_SNE] = 2, + [OPCODE_XPD] = 2, + }; + + /* These opcodes get broken down in a way that allow two + * args to be immediates. + */ + if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) { + if (arg == 1 || arg == 2) + return GL_TRUE; + } + + if (opcode > ARRAY_SIZE(opcode_array)) + return GL_FALSE; + + return arg == opcode_array[opcode] - 1; +} static struct brw_reg get_tmp( struct brw_vs_compile *c ) { @@ -59,6 +98,39 @@ static void release_tmps( struct brw_vs_compile *c ) c->last_tmp = c->first_tmp; } +static int +get_first_reladdr_output(struct gl_vertex_program *vp) +{ + int i; + int first_reladdr_output = VERT_RESULT_MAX; + + for (i = 0; i < vp->Base.NumInstructions; i++) { + struct prog_instruction *inst = vp->Base.Instructions + i; + + if (inst->DstReg.File == PROGRAM_OUTPUT && + inst->DstReg.RelAddr && + inst->DstReg.Index < first_reladdr_output) + first_reladdr_output = inst->DstReg.Index; + } + + return first_reladdr_output; +} + +/* Clears the record of which vp_const_buffer elements have been + * loaded into our constant buffer registers, for the starts of new + * blocks after control flow. + */ +static void +clear_current_const(struct brw_vs_compile *c) +{ + unsigned int i; + + if (c->vp->use_const_buffer) { + for (i = 0; i < 3; i++) { + c->current_const[i].index = -1; + } + } +} /** * Preallocate GRF register before code emit. @@ -67,8 +139,14 @@ static void release_tmps( struct brw_vs_compile *c ) */ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) { - GLuint i, reg = 0, mrf; + struct intel_context *intel = &c->func.brw->intel; + GLuint i, reg = 0, mrf, j; int attributes_in_vue; + int first_reladdr_output; + int max_constant; + int constant = 0; + int vert_result_reoder[VERT_RESULT_MAX]; + int bfc = 0; /* Determine whether to use a real constant buffer or use a block * of GRF registers for constants. The later is faster but only @@ -91,33 +169,97 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) /* User clip planes from curbe: */ if (c->key.nr_userclip) { - for (i = 0; i < c->key.nr_userclip; i++) { - c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1); - } + if (intel->gen >= 6) { + for (i = 0; i < c->key.nr_userclip; i++) { + c->userplane[i] = stride(brw_vec4_grf(reg + i / 2, + (i % 2) * 4), 0, 4, 1); + } + reg += ALIGN(c->key.nr_userclip, 2) / 2; + } else { + for (i = 0; i < c->key.nr_userclip; i++) { + c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2, + (i % 2) * 4), 0, 4, 1); + } + reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2; + } - /* Deal with curbe alignment: - */ - reg += ((6 + c->key.nr_userclip + 3) / 4) * 2; } - /* Vertex program parameters from curbe: + /* Assign some (probably all) of the vertex program constants to + * the push constant buffer/CURBE. + * + * There's an obvious limit to the numer of push constants equal to + * the number of register available, and that number is smaller + * than the minimum maximum number of vertex program parameters, so + * support for pull constants is required if we overflow. + * Additionally, on gen6 the number of push constants is even + * lower. + * + * When there's relative addressing, we don't know what range of + * Mesa IR registers can be accessed. And generally, when relative + * addressing is used we also have too many constants to load them + * all as push constants. So, we'll just support relative + * addressing out of the pull constant buffers, and try to load as + * many statically-accessed constants into the push constant buffer + * as we can. */ - if (c->vp->use_const_buffer) { - /* get constants from a real constant buffer */ - c->prog_data.curb_read_length = 0; - c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */ + if (intel->gen >= 6) { + /* We can only load 32 regs of push constants. */ + max_constant = 32 * 2 - c->key.nr_userclip; + } else { + max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries; } - else { - /* use a section of the GRF for constants */ - GLuint nr_params = c->vp->program.Base.Parameters->NumParameters; - for (i = 0; i < nr_params; i++) { - c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1); + + /* constant_map maps from ParameterValues[] index to index in the + * push constant buffer, or -1 if it's only in the pull constant + * buffer. + */ + memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters); + for (i = 0; + i < c->vp->program.Base.NumInstructions && constant < max_constant; + i++) { + struct prog_instruction *inst = &c->vp->program.Base.Instructions[i]; + int arg; + + for (arg = 0; arg < 3 && constant < max_constant; arg++) { + if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR && + inst->SrcReg[arg].File != PROGRAM_CONSTANT && + inst->SrcReg[arg].File != PROGRAM_UNIFORM && + inst->SrcReg[arg].File != PROGRAM_ENV_PARAM && + inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) { + continue; + } + + if (inst->SrcReg[arg].RelAddr) { + c->vp->use_const_buffer = GL_TRUE; + continue; + } + + if (c->constant_map[inst->SrcReg[arg].Index] == -1) { + c->constant_map[inst->SrcReg[arg].Index] = constant++; + } } - reg += (nr_params + 1) / 2; - c->prog_data.curb_read_length = reg - 1; + } - c->prog_data.nr_params = nr_params * 4; + /* If we ran out of push constant space, then we'll also upload all + * constants through the pull constant buffer so that they can be + * accessed no matter what. For relative addressing (the common + * case) we need them all in place anyway. + */ + if (constant == max_constant) + c->vp->use_const_buffer = GL_TRUE; + + for (i = 0; i < constant; i++) { + c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2, + (i % 2) * 4), + 0, 4, 1); } + reg += (constant + 1) / 2; + c->prog_data.curb_read_length = reg - 1; + c->prog_data.nr_params = constant * 4; + /* XXX 0 causes a bug elsewhere... */ + if (intel->gen < 6 && c->prog_data.nr_params == 0) + c->prog_data.nr_params = 4; /* Allocate input regs: */ @@ -141,13 +283,47 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) c->first_output = reg; c->first_overflow_output = 0; - if (BRW_IS_IGDNG(c->func.brw)) - mrf = 8; + if (intel->gen >= 6) { + mrf = 3; + if (c->key.nr_userclip) + mrf += 2; + } else if (intel->gen == 5) + mrf = 8; else - mrf = 4; + mrf = 4; + + first_reladdr_output = get_first_reladdr_output(&c->vp->program); + + for (i = 0; i < VERT_RESULT_MAX; i++) + vert_result_reoder[i] = i; + + /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */ + if (intel->gen >= 6 && c->key.two_side_color) { + if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) && + (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) { + assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)); + assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)); + bfc = 2; + } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) && + (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0))) + bfc = 1; + + if (bfc) { + for (i = 0; i < bfc; i++) { + vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i; + vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i; + } + + for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) { + vert_result_reoder[i] = i - bfc; + } + } + } - for (i = 0; i < VERT_RESULT_MAX; i++) { - if (c->prog_data.outputs_written & (1 << i)) { + for (j = 0; j < VERT_RESULT_MAX; j++) { + i = vert_result_reoder[j]; + + if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) { c->nr_outputs++; assert(i < Elements(c->regs[PROGRAM_OUTPUT])); if (i == VERT_RESULT_HPOS) { @@ -157,19 +333,32 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) else if (i == VERT_RESULT_PSIZ) { c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0); reg++; - mrf++; /* just a placeholder? XXX fix later stages & remove this */ } else { - if (mrf < 16) { + /* Two restrictions on our compute-to-MRF here. The + * message length for all SEND messages is restricted to + * [1,15], so we can't use mrf 15, as that means a length + * of 16. + * + * Additionally, URB writes are aligned to URB rows, so we + * need to put an even number of registers of URB data in + * each URB write so that the later write is aligned. A + * message length of 15 means 1 message header reg plus 14 + * regs of URB data. + * + * For attributes beyond the compute-to-MRF, we compute to + * GRFs and they will be written in the second URB_WRITE. + */ + if (first_reladdr_output > i && mrf < 15) { c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf); mrf++; } else { - /* too many vertex results to fit in MRF, use GRF for overflow */ - if (!c->first_overflow_output) + if (mrf >= 15 && !c->first_overflow_output) c->first_overflow_output = i; c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0); reg++; + mrf++; } } } @@ -200,10 +389,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) if (c->vp->use_const_buffer) { for (i = 0; i < 3; i++) { - c->current_const[i].index = -1; c->current_const[i].reg = brw_vec8_grf(reg, 0); reg++; } + clear_current_const(c); } for (i = 0; i < 128; i++) { @@ -213,8 +402,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) } } - c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0); - reg += 2; + if (c->needs_stack) { + c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0); + reg += 2; + } /* Some opcodes need an internal temporary: */ @@ -238,17 +429,26 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) */ attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs); - if (BRW_IS_IGDNG(c->func.brw)) - c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4; + /* See emit_vertex_write() for where the VUE's overhead on top of the + * attributes comes from. + */ + if (intel->gen >= 6) { + int header_regs = 2; + if (c->key.nr_userclip) + header_regs += 2; + + c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8; + } else if (intel->gen == 5) + c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4; else - c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4; + c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4; c->prog_data.total_grf = reg; - if (INTEL_DEBUG & DEBUG_VS) { - _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs); - _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries); - _mesa_printf("%s reg = %d\n", __FUNCTION__, reg); + if (unlikely(INTEL_DEBUG & DEBUG_VS)) { + printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs); + printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries); + printf("%s reg = %d\n", __FUNCTION__, reg); } } @@ -331,63 +531,93 @@ static void unalias3( struct brw_vs_compile *c, } } -static void emit_sop( struct brw_compile *p, +static void emit_sop( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0, struct brw_reg arg1, GLuint cond) { + struct brw_compile *p = &c->func; + brw_MOV(p, dst, brw_imm_f(0.0f)); brw_CMP(p, brw_null_reg(), cond, arg0, arg1); brw_MOV(p, dst, brw_imm_f(1.0f)); brw_set_predicate_control_flag_value(p, 0xff); } -static void emit_seq( struct brw_compile *p, +static void emit_seq( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0, struct brw_reg arg1 ) { - emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ); + emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ); } -static void emit_sne( struct brw_compile *p, +static void emit_sne( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0, struct brw_reg arg1 ) { - emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ); + emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ); } -static void emit_slt( struct brw_compile *p, +static void emit_slt( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0, struct brw_reg arg1 ) { - emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L); + emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L); } -static void emit_sle( struct brw_compile *p, +static void emit_sle( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0, struct brw_reg arg1 ) { - emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE); + emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE); } -static void emit_sgt( struct brw_compile *p, +static void emit_sgt( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0, struct brw_reg arg1 ) { - emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G); + emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G); } -static void emit_sge( struct brw_compile *p, +static void emit_sge( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0, struct brw_reg arg1 ) { - emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE); + emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE); +} + +static void emit_cmp( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + struct brw_reg arg2 ) +{ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0)); + brw_SEL(p, dst, arg1, arg2); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + +static void emit_sign(struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0) +{ + struct brw_compile *p = &c->func; + + brw_MOV(p, dst, brw_imm_f(0)); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0)); + brw_MOV(p, dst, brw_imm_f(-1.0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0)); + brw_MOV(p, dst, brw_imm_f(1.0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); } static void emit_max( struct brw_compile *p, @@ -395,9 +625,18 @@ static void emit_max( struct brw_compile *p, struct brw_reg arg0, struct brw_reg arg1 ) { - brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); - brw_SEL(p, dst, arg1, arg0); - brw_set_predicate_control(p, BRW_PREDICATE_NONE); + struct intel_context *intel = &p->brw->intel; + + if (intel->gen >= 6) { + brw_set_conditionalmod(p, BRW_CONDITIONAL_GE); + brw_SEL(p, dst, arg0, arg1); + brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } else { + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1); + brw_SEL(p, dst, arg0, arg1); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } } static void emit_min( struct brw_compile *p, @@ -405,17 +644,41 @@ static void emit_min( struct brw_compile *p, struct brw_reg arg0, struct brw_reg arg1 ) { - brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); - brw_SEL(p, dst, arg0, arg1); - brw_set_predicate_control(p, BRW_PREDICATE_NONE); + struct intel_context *intel = &p->brw->intel; + + if (intel->gen >= 6) { + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_SEL(p, dst, arg0, arg1); + brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } else { + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); + brw_SEL(p, dst, arg0, arg1); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } } +static void emit_arl(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src) +{ + struct intel_context *intel = &p->brw->intel; -static void emit_math1( struct brw_vs_compile *c, - GLuint function, - struct brw_reg dst, - struct brw_reg arg0, - GLuint precision) + if (intel->gen >= 6) { + struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F); + + brw_RNDD(p, dst_f, src); + brw_MOV(p, dst, dst_f); + } else { + brw_RNDD(p, dst, src); + } +} + +static void emit_math1_gen4(struct brw_vs_compile *c, + GLuint function, + struct brw_reg dst, + struct brw_reg arg0, + GLuint precision) { /* There are various odd behaviours with SEND on the simulator. In * addition there are documented issues with the fact that the GEN4 @@ -426,10 +689,13 @@ static void emit_math1( struct brw_vs_compile *c, */ struct brw_compile *p = &c->func; struct brw_reg tmp = dst; - GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf || - dst.file != BRW_GENERAL_REGISTER_FILE); + GLboolean need_tmp = GL_FALSE; - if (need_tmp) + if (dst.file != BRW_GENERAL_REGISTER_FILE || + dst.dw1.bits.writemask != 0xf) + need_tmp = GL_TRUE; + + if (need_tmp) tmp = get_tmp(c); brw_math(p, @@ -447,8 +713,59 @@ static void emit_math1( struct brw_vs_compile *c, } } +static void +emit_math1_gen6(struct brw_vs_compile *c, + GLuint function, + struct brw_reg dst, + struct brw_reg arg0, + GLuint precision) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp_src, tmp_dst; -static void emit_math2( struct brw_vs_compile *c, + /* Something is strange on gen6 math in 16-wide mode, though the + * docs say it's supposed to work. Punt to using align1 mode, + * which doesn't do writemasking and swizzles. + */ + tmp_src = get_tmp(c); + tmp_dst = get_tmp(c); + + brw_MOV(p, tmp_src, arg0); + + brw_set_access_mode(p, BRW_ALIGN_1); + brw_math(p, + tmp_dst, + function, + BRW_MATH_SATURATE_NONE, + 2, + tmp_src, + BRW_MATH_DATA_SCALAR, + precision); + brw_set_access_mode(p, BRW_ALIGN_16); + + brw_MOV(p, dst, tmp_dst); + + release_tmp(c, tmp_src); + release_tmp(c, tmp_dst); +} + +static void +emit_math1(struct brw_vs_compile *c, + GLuint function, + struct brw_reg dst, + struct brw_reg arg0, + GLuint precision) +{ + struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; + + if (intel->gen >= 6) + emit_math1_gen6(c, function, dst, arg0, precision); + else + emit_math1_gen4(c, function, dst, arg0, precision); +} + +static void emit_math2_gen4( struct brw_vs_compile *c, GLuint function, struct brw_reg dst, struct brw_reg arg0, @@ -457,8 +774,11 @@ static void emit_math2( struct brw_vs_compile *c, { struct brw_compile *p = &c->func; struct brw_reg tmp = dst; - GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf || - dst.file != BRW_GENERAL_REGISTER_FILE); + GLboolean need_tmp = GL_FALSE; + + if (dst.file != BRW_GENERAL_REGISTER_FILE || + dst.dw1.bits.writemask != 0xf) + need_tmp = GL_TRUE; if (need_tmp) tmp = get_tmp(c); @@ -480,6 +800,53 @@ static void emit_math2( struct brw_vs_compile *c, } } +static void emit_math2_gen6( struct brw_vs_compile *c, + GLuint function, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + GLuint precision) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp_src0, tmp_src1, tmp_dst; + + tmp_src0 = get_tmp(c); + tmp_src1 = get_tmp(c); + tmp_dst = get_tmp(c); + + brw_MOV(p, tmp_src0, arg0); + brw_MOV(p, tmp_src1, arg1); + + brw_set_access_mode(p, BRW_ALIGN_1); + brw_math2(p, + tmp_dst, + function, + tmp_src0, + tmp_src1); + brw_set_access_mode(p, BRW_ALIGN_16); + + brw_MOV(p, dst, tmp_dst); + + release_tmp(c, tmp_src0); + release_tmp(c, tmp_src1); + release_tmp(c, tmp_dst); +} + +static void emit_math2( struct brw_vs_compile *c, + GLuint function, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + GLuint precision) +{ + struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; + + if (intel->gen >= 6) + emit_math2_gen6(c, function, dst, arg0, arg1, precision); + else + emit_math2_gen4(c, function, dst, arg0, arg1, precision); +} static void emit_exp_noalias( struct brw_vs_compile *c, struct brw_reg dst, @@ -748,15 +1115,12 @@ get_constant(struct brw_vs_compile *c, { const struct prog_src_register *src = &inst->SrcReg[argIndex]; struct brw_compile *p = &c->func; - struct brw_reg const_reg; - struct brw_reg const2_reg; - const GLboolean relAddr = src->RelAddr; + struct brw_reg const_reg = c->current_const[argIndex].reg; assert(argIndex < 3); - if (c->current_const[argIndex].index != src->Index || relAddr) { - struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; - + if (c->current_const[argIndex].index != src->Index) { + /* Keep track of the last constant loaded in this slot, for reuse. */ c->current_const[argIndex].index = src->Index; #if 0 @@ -765,49 +1129,58 @@ get_constant(struct brw_vs_compile *c, #endif /* need to fetch the constant now */ brw_dp_READ_4_vs(p, - c->current_const[argIndex].reg,/* writeback dest */ - 0, /* oword */ - relAddr, /* relative indexing? */ - addrReg, /* address register */ + const_reg, /* writeback dest */ 16 * src->Index, /* byte offset */ SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ ); - - if (relAddr) { - /* second read */ - const2_reg = get_tmp(c); - - /* use upper half of address reg for second read */ - addrReg = stride(addrReg, 0, 4, 0); - addrReg.subnr = 16; - - brw_dp_READ_4_vs(p, - const2_reg, /* writeback dest */ - 1, /* oword */ - relAddr, /* relative indexing? */ - addrReg, /* address register */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_VERT_CONST_BUFFER - ); - } } - const_reg = c->current_const[argIndex].reg; + /* replicate lower four floats into upper half (to get XYZWXYZW) */ + const_reg = stride(const_reg, 0, 4, 0); + const_reg.subnr = 0; - if (relAddr) { - /* merge the two Owords into the constant register */ - /* const_reg[7..4] = const2_reg[7..4] */ - brw_MOV(p, - suboffset(stride(const_reg, 0, 4, 1), 4), - suboffset(stride(const2_reg, 0, 4, 1), 4)); - release_tmp(c, const2_reg); - } - else { - /* replicate lower four floats into upper half (to get XYZWXYZW) */ - const_reg = stride(const_reg, 0, 4, 0); - const_reg.subnr = 0; + return const_reg; +} + +static struct brw_reg +get_reladdr_constant(struct brw_vs_compile *c, + const struct prog_instruction *inst, + GLuint argIndex) +{ + const struct prog_src_register *src = &inst->SrcReg[argIndex]; + struct brw_compile *p = &c->func; + struct brw_context *brw = p->brw; + struct intel_context *intel = &brw->intel; + struct brw_reg const_reg = c->current_const[argIndex].reg; + struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; + uint32_t offset; + + assert(argIndex < 3); + + /* Can't reuse a reladdr constant load. */ + c->current_const[argIndex].index = -1; + + #if 0 + printf(" fetch const[a0.x+%d] for arg %d into reg %d\n", + src->Index, argIndex, c->current_const[argIndex].reg.nr); +#endif + + if (intel->gen >= 6) { + offset = src->Index; + } else { + struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D); + brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16)); + addr_reg = byte_addr_reg; + offset = 16 * src->Index; } + /* fetch the first vec4 */ + brw_dp_READ_4_vs_relative(p, + const_reg, + addr_reg, + offset, + SURF_INDEX_VERT_CONST_BUFFER); + return const_reg; } @@ -852,36 +1225,75 @@ static struct brw_reg get_reg( struct brw_vs_compile *c, */ static struct brw_reg deref( struct brw_vs_compile *c, struct brw_reg arg, - GLint offset) + GLint offset, + GLuint reg_size ) { struct brw_compile *p = &c->func; - struct brw_reg tmp = vec4(get_tmp(c)); + struct brw_reg tmp = get_tmp(c); struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; - struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW); - GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16; + struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D); + GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size; struct brw_reg indirect = brw_vec4_indirect(0,0); + struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); + + /* Set the vertical stride on the register access so that the first + * 4 components come from a0.0 and the second 4 from a0.1. + */ + indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL; { brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); - /* This is pretty clunky - load the address register twice and - * fetch each 4-dword value in turn. There must be a way to do - * this in a single pass, but I couldn't get it to work. - */ - brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset)); - brw_MOV(p, tmp, indirect); + brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); - brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset)); - brw_MOV(p, suboffset(tmp, 4), indirect); + brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset)); + + brw_MOV(p, tmp, indirect); brw_pop_insn_state(p); } - + /* NOTE: tmp not released */ - return vec8(tmp); + return tmp; } +static void +move_to_reladdr_dst(struct brw_vs_compile *c, + const struct prog_instruction *inst, + struct brw_reg val) +{ + struct brw_compile *p = &c->func; + int reg_size = 32; + struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; + struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D); + struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index]; + GLuint byte_offset = base.nr * 32 + base.subnr; + struct brw_reg indirect = brw_vec4_indirect(0,0); + struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); + + /* Because destination register indirect addressing can only use + * one index, we'll write each vertex's vec4 value separately. + */ + val.width = BRW_WIDTH_4; + val.vstride = BRW_VERTICAL_STRIDE_4; + + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + + brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); + brw_MOV(p, indirect, val); + + brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, + brw_imm_uw(byte_offset + reg_size / 2)); + brw_MOV(p, indirect, suboffset(val, 4)); + + brw_pop_insn_state(p); +} /** * Get brw reg corresponding to the instruction's [argIndex] src reg. @@ -896,12 +1308,61 @@ get_src_reg( struct brw_vs_compile *c, const GLint index = inst->SrcReg[argIndex].Index; const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr; + if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) { + const struct prog_src_register *src = &inst->SrcReg[argIndex]; + + if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO, + SWIZZLE_ZERO, + SWIZZLE_ZERO, + SWIZZLE_ZERO)) { + return brw_imm_f(0.0f); + } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE, + SWIZZLE_ONE, + SWIZZLE_ONE, + SWIZZLE_ONE)) { + if (src->Negate) + return brw_imm_f(-1.0F); + else + return brw_imm_f(1.0F); + } else if (src->File == PROGRAM_CONSTANT) { + const struct gl_program_parameter_list *params; + float f; + int component = -1; + + switch (src->Swizzle) { + case SWIZZLE_XXXX: + component = 0; + break; + case SWIZZLE_YYYY: + component = 1; + break; + case SWIZZLE_ZZZZ: + component = 2; + break; + case SWIZZLE_WWWW: + component = 3; + break; + } + + if (component >= 0) { + params = c->vp->program.Base.Parameters; + f = params->ParameterValues[src->Index][component]; + + if (src->Abs) + f = fabs(f); + if (src->Negate) + f = -f; + return brw_imm_f(f); + } + } + } + switch (file) { case PROGRAM_TEMPORARY: case PROGRAM_INPUT: case PROGRAM_OUTPUT: if (relAddr) { - return deref(c, c->regs[file][0], index); + return deref(c, c->regs[file][0], index, 32); } else { assert(c->regs[file][index].nr != 0); @@ -912,15 +1373,18 @@ get_src_reg( struct brw_vs_compile *c, case PROGRAM_CONSTANT: case PROGRAM_UNIFORM: case PROGRAM_ENV_PARAM: - if (c->vp->use_const_buffer) { - return get_constant(c, inst, argIndex); - } - else if (relAddr) { - return deref(c, c->regs[PROGRAM_STATE_VAR][0], index); - } - else { - assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0); - return c->regs[PROGRAM_STATE_VAR][index]; + case PROGRAM_LOCAL_PARAM: + if (!relAddr && c->constant_map[index] != -1) { + /* Take from the push constant buffer if possible. */ + assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0); + return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]]; + } else { + /* Must be in the pull constant buffer then .*/ + assert(c->vp->use_const_buffer); + if (relAddr) + return get_reladdr_constant(c, inst, argIndex); + else + return get_constant(c, inst, argIndex); } case PROGRAM_ADDRESS: assert(index == 0); @@ -930,7 +1394,6 @@ get_src_reg( struct brw_vs_compile *c, /* this is a normal case since we loop over all three src args */ return brw_null_reg(); - case PROGRAM_LOCAL_PARAM: case PROGRAM_WRITE_ONLY: default: assert(0); @@ -938,26 +1401,6 @@ get_src_reg( struct brw_vs_compile *c, } } - -static void emit_arl( struct brw_vs_compile *c, - struct brw_reg dst, - struct brw_reg arg0 ) -{ - struct brw_compile *p = &c->func; - struct brw_reg tmp = dst; - GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); - - if (need_tmp) - tmp = get_tmp(c); - - brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */ - brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */ - - if (need_tmp) - release_tmp(c, tmp); -} - - /** * Return the brw reg for the given instruction's src argument. * Will return mangled results for SWZ op. The emit_swz() function @@ -978,10 +1421,12 @@ static struct brw_reg get_arg( struct brw_vs_compile *c, /* Convert 3-bit swizzle to 2-bit. */ - reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0), - GET_SWZ(src->Swizzle, 1), - GET_SWZ(src->Swizzle, 2), - GET_SWZ(src->Swizzle, 3)); + if (reg.file != BRW_IMMEDIATE_VALUE) { + reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0), + GET_SWZ(src->Swizzle, 1), + GET_SWZ(src->Swizzle, 2), + GET_SWZ(src->Swizzle, 3)); + } /* Note this is ok for non-swizzle instructions: */ @@ -1002,8 +1447,17 @@ static struct brw_reg get_dst( struct brw_vs_compile *c, switch (dst.File) { case PROGRAM_TEMPORARY: case PROGRAM_OUTPUT: - assert(c->regs[dst.File][dst.Index].nr != 0); - reg = c->regs[dst.File][dst.Index]; + /* register-indirect addressing is only 1x1, not VxH, for + * destination regs. So, for RelAddr we'll return a temporary + * for the dest and do a move of the result to the RelAddr + * register after the instruction emit. + */ + if (dst.RelAddr) { + reg = get_tmp(c); + } else { + assert(c->regs[dst.File][dst.Index].nr != 0); + reg = c->regs[dst.File][dst.Index]; + } break; case PROGRAM_ADDRESS: assert(dst.Index == 0); @@ -1018,6 +1472,7 @@ static struct brw_reg get_dst( struct brw_vs_compile *c, reg = brw_null_reg(); } + assert(reg.type != BRW_IMMEDIATE_VALUE); reg.dw1.bits.writemask = dst.WriteMask; return reg; @@ -1100,11 +1555,13 @@ static void emit_swz( struct brw_vs_compile *c, static void emit_vertex_write( struct brw_vs_compile *c) { struct brw_compile *p = &c->func; - struct brw_reg m0 = brw_message_reg(0); + struct brw_context *brw = p->brw; + struct intel_context *intel = &brw->intel; struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS]; struct brw_reg ndc; int eot; - GLuint len_vertext_header = 2; + GLuint len_vertex_header = 2; + int next_mrf, i; if (c->key.copy_edgeflag) { brw_MOV(p, @@ -1112,19 +1569,45 @@ static void emit_vertex_write( struct brw_vs_compile *c) get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG)); } - /* Build ndc coords */ - ndc = get_tmp(c); - /* ndc = 1.0 / pos.w */ - emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); - /* ndc.xyz = pos * ndc */ - brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc); + if (intel->gen < 6) { + /* Build ndc coords */ + ndc = get_tmp(c); + /* ndc = 1.0 / pos.w */ + emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); + /* ndc.xyz = pos * ndc */ + brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc); + } /* Update the header for point size, user clipping flags, and -ve rhw * workaround. */ - if ((c->prog_data.outputs_written & (1<key.nr_userclip || BRW_IS_965(p->brw)) - { + if (intel->gen >= 6) { + struct brw_reg m1 = brw_message_reg(1); + + /* On gen6, m1 has each value in a separate dword, so we never + * need to mess with a temporary for computing the m1 value. + */ + brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); + if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) { + brw_MOV(p, brw_writemask(m1, WRITEMASK_W), + brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0)); + } + + /* Set the user clip distances in dword 8-15. (m3-4)*/ + if (c->key.nr_userclip) { + for (i = 0; i < c->key.nr_userclip; i++) { + struct brw_reg m; + if (i < 4) + m = brw_message_reg(3); + else + m = brw_message_reg(4); + + brw_DP4(p, brw_writemask(m, (1 << (i & 7))),pos, c->userplane[i]); + } + } + } else if ((c->prog_data.outputs_written & + BITFIELD64_BIT(VERT_RESULT_PSIZ)) || + c->key.nr_userclip || brw->has_negative_rhw_bug) { struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); GLuint i; @@ -1132,10 +1615,12 @@ static void emit_vertex_write( struct brw_vs_compile *c) brw_set_access_mode(p, BRW_ALIGN_16); - if (c->prog_data.outputs_written & (1<prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) { struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ]; - brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11)); - brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8)); + brw_MUL(p, brw_writemask(header1, WRITEMASK_W), + brw_swizzle1(psiz, 0), brw_imm_f(1<<11)); + brw_AND(p, brw_writemask(header1, WRITEMASK_W), + header1, brw_imm_ud(0x7ff<<8)); } for (i = 0; i < c->key.nr_userclip; i++) { @@ -1154,7 +1639,7 @@ static void emit_vertex_write( struct brw_vs_compile *c) * Later, clipping will detect ucp[6] and ensure the primitive is * clipped against all fixed planes. */ - if (BRW_IS_965(p->brw)) { + if (brw->has_negative_rhw_bug) { brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_L, @@ -1180,21 +1665,65 @@ static void emit_vertex_write( struct brw_vs_compile *c) * of zeros followed by two sets of NDC coordinates: */ brw_set_access_mode(p, BRW_ALIGN_1); - brw_MOV(p, offset(m0, 2), ndc); - - if (BRW_IS_IGDNG(p->brw)) { - /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */ - brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */ - /* m4, m5 contain the distances from vertex to the user clip planeXXX. - * Seems it is useless for us. - * m6 is used for aligning, so that the remainder of vertex element is - * reg-aligned. - */ - brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */ - len_vertext_header = 6; + brw_set_acc_write_control(p, 0); + + /* The VUE layout is documented in Volume 2a. */ + if (intel->gen >= 6) { + /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge: + * dword 0-3 (m1) of the header is indices, point width, clip flags. + * dword 4-7 (m2) is the 4D space position + * dword 8-15 (m3,m4) of the vertex header is the user clip distance if + * enabled. + * m3 or 5 is the first vertex element data we fill, which is + * the vertex position. + */ + brw_MOV(p, brw_message_reg(2), pos); + len_vertex_header = 1; + if (c->key.nr_userclip > 0) + len_vertex_header += 2; + } else if (intel->gen == 5) { + /* There are 20 DWs (D0-D19) in VUE header on Ironlake: + * dword 0-3 (m1) of the header is indices, point width, clip flags. + * dword 4-7 (m2) is the ndc position (set above) + * dword 8-11 (m3) of the vertex header is the 4D space position + * dword 12-19 (m4,m5) of the vertex header is the user clip distance. + * m6 is a pad so that the vertex element data is aligned + * m7 is the first vertex data we fill, which is the vertex position. + */ + brw_MOV(p, brw_message_reg(2), ndc); + brw_MOV(p, brw_message_reg(3), pos); + brw_MOV(p, brw_message_reg(7), pos); + len_vertex_header = 6; } else { - brw_MOV(p, offset(m0, 3), pos); - len_vertext_header = 2; + /* There are 8 dwords in VUE header pre-Ironlake: + * dword 0-3 (m1) is indices, point width, clip flags. + * dword 4-7 (m2) is ndc position (set above) + * + * dword 8-11 (m3) is the first vertex data, which we always have be the + * vertex position. + */ + brw_MOV(p, brw_message_reg(2), ndc); + brw_MOV(p, brw_message_reg(3), pos); + len_vertex_header = 2; + } + + /* Move variable-addressed, non-overflow outputs to their MRFs. */ + next_mrf = 2 + len_vertex_header; + for (i = 0; i < VERT_RESULT_MAX; i++) { + if (c->first_overflow_output > 0 && i >= c->first_overflow_output) + break; + if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i))) + continue; + if (i == VERT_RESULT_PSIZ) + continue; + + if (i >= VERT_RESULT_TEX0 && + c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) { + brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]); + next_mrf++; + } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) { + next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1; + } } eot = (c->first_overflow_output == 0); @@ -1205,10 +1734,10 @@ static void emit_vertex_write( struct brw_vs_compile *c) c->r0, /* src */ 0, /* allocate */ 1, /* used */ - MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */ + MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */ 0, /* response len */ eot, /* eot */ - 1, /* writes complete */ + eot, /* writes complete */ 0, /* urb destination offset */ BRW_URB_SWIZZLE_INTERLEAVE); @@ -1217,62 +1746,84 @@ static void emit_vertex_write( struct brw_vs_compile *c) * Move the overflowed attributes from the GRF to the MRF and * issue another brw_urb_WRITE(). */ - /* XXX I'm not 100% sure about which MRF regs to use here. Starting - * at mrf[4] atm... - */ - GLuint i, mrf = 0; + GLuint i, mrf = 1; for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) { - if (c->prog_data.outputs_written & (1 << i)) { + if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) { /* move from GRF to MRF */ - brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]); + brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]); mrf++; } } brw_urb_WRITE(p, brw_null_reg(), /* dest */ - 4, /* starting mrf reg nr */ + 0, /* starting mrf reg nr */ c->r0, /* src */ 0, /* allocate */ 1, /* used */ - mrf+1, /* msg len */ + mrf, /* msg len */ 0, /* response len */ 1, /* eot */ 1, /* writes complete */ - BRW_MAX_MRF-1, /* urb destination offset */ + 14 / 2, /* urb destination offset */ BRW_URB_SWIZZLE_INTERLEAVE); } } - -/** - * Called after code generation to resolve subroutine calls and the - * END instruction. - * \param end_inst points to brw code for END instruction - * \param last_inst points to last instruction emitted before vertex write - */ -static void -post_vs_emit( struct brw_vs_compile *c, - struct brw_instruction *end_inst, - struct brw_instruction *last_inst ) +static GLboolean +accumulator_contains(struct brw_vs_compile *c, struct brw_reg val) { - GLint offset; - - brw_resolve_cals(&c->func); - - /* patch up the END code to jump past subroutines, etc */ - offset = last_inst - end_inst; - if (offset > 1) { - brw_set_src1(end_inst, brw_imm_d(offset * 16)); - } else { - end_inst->header.opcode = BRW_OPCODE_NOP; + struct brw_compile *p = &c->func; + struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1]; + + if (p->nr_insn == 0) + return GL_FALSE; + + if (val.address_mode != BRW_ADDRESS_DIRECT) + return GL_FALSE; + + switch (prev_insn->header.opcode) { + case BRW_OPCODE_MOV: + case BRW_OPCODE_MAC: + case BRW_OPCODE_MUL: + if (prev_insn->header.access_mode == BRW_ALIGN_16 && + prev_insn->header.execution_size == val.width && + prev_insn->bits1.da1.dest_reg_file == val.file && + prev_insn->bits1.da1.dest_reg_type == val.type && + prev_insn->bits1.da1.dest_address_mode == val.address_mode && + prev_insn->bits1.da1.dest_reg_nr == val.nr && + prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 && + prev_insn->bits1.da16.dest_writemask == 0xf) + return GL_TRUE; + else + return GL_FALSE; + default: + return GL_FALSE; } } static uint32_t -get_predicate(uint32_t swizzle) +get_predicate(const struct prog_instruction *inst) { - switch (swizzle) { + if (inst->DstReg.CondMask == COND_TR) + return BRW_PREDICATE_NONE; + + /* All of GLSL only produces predicates for COND_NE and one channel per + * vector. Fail badly if someone starts doing something else, as it might + * mean infinite looping or something. + * + * We'd like to support all the condition codes, but our hardware doesn't + * quite match the Mesa IR, which is modeled after the NV extensions. For + * those, the instruction may update the condition codes or not, then any + * later instruction may use one of those condition codes. For gen4, the + * instruction may update the flags register based on one of the condition + * codes output by the instruction, and then further instructions may + * predicate on that. We can probably support this, but it won't + * necessarily be easy. + */ + assert(inst->DstReg.CondMask == COND_NE); + + switch (inst->DstReg.CondSwizzle) { case SWIZZLE_XXXX: return BRW_PREDICATE_ALIGN16_REPLICATE_X; case SWIZZLE_YYYY: @@ -1282,7 +1833,8 @@ get_predicate(uint32_t swizzle) case SWIZZLE_WWWW: return BRW_PREDICATE_ALIGN16_REPLICATE_W; default: - _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n", swizzle); + _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n", + inst->DstReg.CondMask); return BRW_PREDICATE_NORMAL; } } @@ -1294,29 +1846,36 @@ void brw_vs_emit(struct brw_vs_compile *c ) #define MAX_IF_DEPTH 32 #define MAX_LOOP_DEPTH 32 struct brw_compile *p = &c->func; + struct brw_context *brw = p->brw; + struct intel_context *intel = &brw->intel; const GLuint nr_insns = c->vp->program.Base.NumInstructions; GLuint insn, if_depth = 0, loop_depth = 0; - GLuint end_offset = 0; - struct brw_instruction *end_inst, *last_inst; - struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH]; + struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 }; + int if_depth_in_loop[MAX_LOOP_DEPTH]; const struct brw_indirect stack_index = brw_indirect(0, 0); GLuint index; GLuint file; - if (INTEL_DEBUG & DEBUG_VS) { - _mesa_printf("vs-mesa:\n"); - _mesa_print_program(&c->vp->program.Base); - _mesa_printf("\n"); + if (unlikely(INTEL_DEBUG & DEBUG_VS)) { + printf("vs-mesa:\n"); + _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG, + GL_TRUE); + printf("\n"); } brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_set_access_mode(p, BRW_ALIGN_16); - - /* Message registers can't be read, so copy the output into GRF register - if they are used in source registers */ + if_depth_in_loop[loop_depth] = 0; + + brw_set_acc_write_control(p, 1); + for (insn = 0; insn < nr_insns; insn++) { GLuint i; struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn]; + + /* Message registers can't be read, so copy the output into GRF + * register if they are used in source registers + */ for (i = 0; i < 3; i++) { struct prog_src_register *src = &inst->SrcReg[i]; GLuint index = src->Index; @@ -1324,19 +1883,30 @@ void brw_vs_emit(struct brw_vs_compile *c ) if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS) c->output_regs[index].used_in_src = GL_TRUE; } + + switch (inst->Opcode) { + case OPCODE_CAL: + case OPCODE_RET: + c->needs_stack = GL_TRUE; + break; + default: + break; + } } /* Static register allocation */ brw_vs_alloc_regs(c); - brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); + + if (c->needs_stack) + brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); for (insn = 0; insn < nr_insns; insn++) { const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn]; struct brw_reg args[3], dst; GLuint i; - + #if 0 printf("%d: ", insn); _mesa_print_instruction(inst); @@ -1381,6 +1951,9 @@ void brw_vs_emit(struct brw_vs_compile *c ) case OPCODE_COS: emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL); break; + case OPCODE_DP2: + brw_DP2(p, dst, args[0], args[1]); + break; case OPCODE_DP3: brw_DP3(p, dst, args[0], args[1]); break; @@ -1406,7 +1979,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL); break; case OPCODE_ARL: - emit_arl(c, dst, args[0]); + emit_arl(p, dst, args[0]); break; case OPCODE_FLR: brw_RNDD(p, dst, args[0]); @@ -1427,9 +2000,13 @@ void brw_vs_emit(struct brw_vs_compile *c ) unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias); break; case OPCODE_MAD: - brw_MOV(p, brw_acc_reg(), args[2]); + if (!accumulator_contains(c, args[2])) + brw_MOV(p, brw_acc_reg(), args[2]); brw_MAC(p, dst, args[0], args[1]); break; + case OPCODE_CMP: + emit_cmp(p, dst, args[0], args[1], args[2]); + break; case OPCODE_MAX: emit_max(p, dst, args[0], args[1]); break; @@ -1449,29 +2026,32 @@ void brw_vs_emit(struct brw_vs_compile *c ) emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL); break; case OPCODE_RSQ: - emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL); + emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL); break; case OPCODE_SEQ: - emit_seq(p, dst, args[0], args[1]); + unalias2(c, dst, args[0], args[1], emit_seq); break; case OPCODE_SIN: emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL); break; case OPCODE_SNE: - emit_sne(p, dst, args[0], args[1]); + unalias2(c, dst, args[0], args[1], emit_sne); break; case OPCODE_SGE: - emit_sge(p, dst, args[0], args[1]); + unalias2(c, dst, args[0], args[1], emit_sge); break; case OPCODE_SGT: - emit_sgt(p, dst, args[0], args[1]); + unalias2(c, dst, args[0], args[1], emit_sgt); break; case OPCODE_SLT: - emit_slt(p, dst, args[0], args[1]); + unalias2(c, dst, args[0], args[1], emit_slt); break; case OPCODE_SLE: - emit_sle(p, dst, args[0], args[1]); + unalias2(c, dst, args[0], args[1], emit_sle); + break; + case OPCODE_SSG: + unalias1(c, dst, args[0], emit_sign); break; case OPCODE_SUB: brw_ADD(p, dst, args[0], negate(args[1])); @@ -1492,56 +2072,74 @@ void brw_vs_emit(struct brw_vs_compile *c ) case OPCODE_IF: assert(if_depth < MAX_IF_DEPTH); if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8); - if_inst[if_depth]->header.predicate_control = - get_predicate(inst->DstReg.CondSwizzle); + /* Note that brw_IF smashes the predicate_control field. */ + if_inst[if_depth]->header.predicate_control = get_predicate(inst); + if_depth_in_loop[loop_depth]++; if_depth++; break; case OPCODE_ELSE: + clear_current_const(c); + assert(if_depth > 0); if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]); break; case OPCODE_ENDIF: + clear_current_const(c); assert(if_depth > 0); brw_ENDIF(p, if_inst[--if_depth]); + if_depth_in_loop[loop_depth]--; break; -#if 0 case OPCODE_BGNLOOP: + clear_current_const(c); loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8); + if_depth_in_loop[loop_depth] = 0; break; case OPCODE_BRK: - brw_BREAK(p); - brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_set_predicate_control(p, get_predicate(inst)); + brw_BREAK(p, if_depth_in_loop[loop_depth]); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); break; case OPCODE_CONT: - brw_CONT(p); + brw_set_predicate_control(p, get_predicate(inst)); + if (intel->gen >= 6) { + brw_CONT_gen6(p, loop_inst[loop_depth - 1]); + } else { + brw_CONT(p, if_depth_in_loop[loop_depth]); + } brw_set_predicate_control(p, BRW_PREDICATE_NONE); break; - case OPCODE_ENDLOOP: - { - struct brw_instruction *inst0, *inst1; - loop_depth--; - inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]); - /* patch all the BREAK/CONT instructions from last BEGINLOOP */ - while (inst0 > loop_inst[loop_depth]) { - inst0--; - if (inst0->header.opcode == BRW_OPCODE_BREAK) { - inst0->bits3.if_else.jump_count = inst1 - inst0 + 1; - inst0->bits3.if_else.pop_count = 0; - } - else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) { - inst0->bits3.if_else.jump_count = inst1 - inst0; - inst0->bits3.if_else.pop_count = 0; - } - } - } + + case OPCODE_ENDLOOP: { + clear_current_const(c); + struct brw_instruction *inst0, *inst1; + GLuint br = 1; + + loop_depth--; + + if (intel->gen == 5) + br = 2; + + inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]); + + if (intel->gen < 6) { + /* patch all the BREAK/CONT instructions from last BEGINLOOP */ + while (inst0 > loop_inst[loop_depth]) { + inst0--; + if (inst0->header.opcode == BRW_OPCODE_BREAK && + inst0->bits3.if_else.jump_count == 0) { + inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1); + } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE && + inst0->bits3.if_else.jump_count == 0) { + inst0->bits3.if_else.jump_count = br * (inst1 - inst0); + } + } + } + } break; -#else - (void) loop_inst; - (void) loop_depth; -#endif + case OPCODE_BRA: - brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_set_predicate_control(p, get_predicate(inst)); brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); - brw_set_predicate_control_flag_value(p, 0xff); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); break; case OPCODE_CAL: brw_set_access_mode(p, BRW_ALIGN_1); @@ -1559,12 +2157,8 @@ void brw_vs_emit(struct brw_vs_compile *c ) brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0)); brw_set_access_mode(p, BRW_ALIGN_16); break; - case OPCODE_END: - end_offset = p->nr_insn; - /* this instruction will get patched later to jump past subroutine - * code, etc. - */ - brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + case OPCODE_END: + emit_vertex_write(c); break; case OPCODE_PRINT: /* no-op */ @@ -1622,23 +2216,26 @@ void brw_vs_emit(struct brw_vs_compile *c ) } } + if (inst->DstReg.RelAddr) { + assert(inst->DstReg.File == PROGRAM_TEMPORARY|| + inst->DstReg.File == PROGRAM_OUTPUT); + move_to_reladdr_dst(c, inst, dst); + } + release_tmps(c); } - end_inst = &p->store[end_offset]; - last_inst = &p->store[p->nr_insn]; - - /* The END instruction will be patched to jump to this code */ - emit_vertex_write(c); + brw_resolve_cals(p); + brw_set_uip_jip(p); - post_vs_emit(c, end_inst, last_inst); + brw_optimize(p); - if (INTEL_DEBUG & DEBUG_VS) { + if (unlikely(INTEL_DEBUG & DEBUG_VS)) { int i; - _mesa_printf("vs-native:\n"); + printf("vs-native:\n"); for (i = 0; i < p->nr_insn; i++) - brw_disasm(stderr, &p->store[i]); - _mesa_printf("\n"); + brw_disasm(stdout, &p->store[i], intel->gen); + printf("\n"); } }