From 6a8e5e48be0bad4606b2d5d7ba736a3d2a277c55 Mon Sep 17 00:00:00 2001 From: Axel Davy Date: Mon, 8 Dec 2014 15:38:28 +0100 Subject: [PATCH] st/nine: Rewrite LOOP implementation, and a0 aL handling Previous implementation didn't work well with nested loops. Instead of using several address registers, put a0 and aL into normal registers, and copy them to one address register when we need to use them. Wine tests loop_index_test() and nested_loop_test() now pass correctly. Fixes r600g crash while loading Bioshock - bug https://bugs.freedesktop.org/show_bug.cgi?id=85696 Tested-by: David Heidelberg Reviewed-by: Tiziano Bacocco Signed-off-by: Axel Davy Cc: "10.4" --- src/gallium/state_trackers/nine/nine_shader.c | 163 +++++++++++------- 1 file changed, 100 insertions(+), 63 deletions(-) diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c index a6ca6d0d4c7..be930ef2fbc 100644 --- a/src/gallium/state_trackers/nine/nine_shader.c +++ b/src/gallium/state_trackers/nine/nine_shader.c @@ -466,14 +466,14 @@ struct shader_translator struct ureg_src vFace; struct ureg_src s; struct ureg_dst p; - struct ureg_dst a; + struct ureg_dst address; + struct ureg_dst a0; struct ureg_dst tS[8]; /* texture stage registers */ struct ureg_dst tdst; /* scratch dst if we need extra modifiers */ struct ureg_dst t[5]; /* scratch TEMPs */ struct ureg_src vC[2]; /* PS color in */ struct ureg_src vT[8]; /* PS texcoord in */ struct ureg_dst rL[NINE_MAX_LOOP_DEPTH]; /* loop ctr */ - struct ureg_dst aL[NINE_MAX_LOOP_DEPTH]; /* loop ctr ADDR register */ } regs; unsigned num_temp; /* Elements(regs.r) */ unsigned num_scratch; @@ -482,6 +482,7 @@ struct shader_translator unsigned cond_depth; unsigned loop_labels[NINE_MAX_LOOP_DEPTH]; unsigned cond_labels[NINE_MAX_COND_DEPTH]; + boolean loop_or_rep[NINE_MAX_LOOP_DEPTH]; /* true: loop, false: rep */ unsigned *inst_labels; /* LABEL op */ unsigned num_inst_labels; @@ -659,8 +660,10 @@ static INLINE void tx_addr_alloc(struct shader_translator *tx, INT idx) { assert(idx == 0); - if (ureg_dst_is_undef(tx->regs.a)) - tx->regs.a = ureg_DECL_address(tx->ureg); + if (ureg_dst_is_undef(tx->regs.address)) + tx->regs.address = ureg_DECL_address(tx->ureg); + if (ureg_dst_is_undef(tx->regs.a0)) + tx->regs.a0 = ureg_DECL_temporary(tx->ureg); } static INLINE void @@ -702,7 +705,7 @@ tx_endloop(struct shader_translator *tx) } static struct ureg_dst -tx_get_loopctr(struct shader_translator *tx) +tx_get_loopctr(struct shader_translator *tx, boolean loop_or_rep) { const unsigned l = tx->loop_depth - 1; @@ -712,26 +715,32 @@ tx_get_loopctr(struct shader_translator *tx) return ureg_dst_undef(); } - if (ureg_dst_is_undef(tx->regs.aL[l])) - { - struct ureg_dst rreg = ureg_DECL_local_temporary(tx->ureg); - struct ureg_dst areg = ureg_DECL_address(tx->ureg); - unsigned c; - - assert(l % 4 == 0); - for (c = l; c < (l + 4) && c < Elements(tx->regs.aL); ++c) { - tx->regs.rL[c] = ureg_writemask(rreg, 1 << (c & 3)); - tx->regs.aL[c] = ureg_writemask(areg, 1 << (c & 3)); - } + if (ureg_dst_is_undef(tx->regs.rL[l])) { + /* loop or rep ctr creation */ + tx->regs.rL[l] = ureg_DECL_local_temporary(tx->ureg); + tx->loop_or_rep[l] = loop_or_rep; } + /* loop - rep - endloop - endrep not allowed */ + assert(tx->loop_or_rep[l] == loop_or_rep); + return tx->regs.rL[l]; } -static struct ureg_dst -tx_get_aL(struct shader_translator *tx) + +static struct ureg_src +tx_get_loopal(struct shader_translator *tx) { - if (!ureg_dst_is_undef(tx_get_loopctr(tx))) - return tx->regs.aL[tx->loop_depth - 1]; - return ureg_dst_undef(); + int loop_level = tx->loop_depth - 1; + + while (loop_level >= 0) { + /* handle loop - rep - endrep - endloop case */ + if (tx->loop_or_rep[loop_level]) + /* the value is in the loop counter y component (nine implementation) */ + return ureg_scalar(ureg_src(tx->regs.rL[loop_level]), TGSI_SWIZZLE_Y); + loop_level--; + } + + DBG("aL counter requested outside of loop\n"); + return ureg_src_undef(); } static INLINE unsigned * @@ -782,8 +791,12 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param) case D3DSPR_ADDR: assert(!param->rel); if (IS_VS) { - tx_addr_alloc(tx, param->idx); - src = ureg_src(tx->regs.a); + assert(param->idx == 0); + /* the address register (vs only) must be + * assigned before use */ + assert(!ureg_dst_is_undef(tx->regs.a0)); + ureg_ARR(ureg, tx->regs.address, ureg_src(tx->regs.a0)); + src = ureg_src(tx->regs.address); } else { if (tx->version.major < 2 && tx->version.minor < 4) { /* no subroutines, so should be defined */ @@ -857,7 +870,13 @@ tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param) } break; case D3DSPR_LOOP: - src = tx_src_scalar(tx_get_aL(tx)); + if (ureg_dst_is_undef(tx->regs.address)) + tx->regs.address = ureg_DECL_address(ureg); + if (!tx->native_integers) + ureg_ARR(ureg, tx->regs.address, tx_get_loopal(tx)); + else + ureg_UARL(ureg, tx->regs.address, tx_get_loopal(tx)); + src = ureg_src(tx->regs.address); break; case D3DSPR_MISCTYPE: switch (param->idx) { @@ -996,7 +1015,7 @@ _tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param) dst = ureg_dst(tx->regs.vT[param->idx]); } else { tx_addr_alloc(tx, param->idx); - dst = tx->regs.a; + dst = tx->regs.a0; } break; case D3DSPR_RASTOUT: @@ -1417,9 +1436,17 @@ DECL_SPECIAL(CALLNZ) DECL_SPECIAL(MOV_vs1x) { if (tx->insn.dst[0].file == D3DSPR_ADDR) { - ureg_ARL(tx->ureg, + /* Implementation note: We don't write directly + * to the addr register, but to an intermediate + * float register. + * Contrary to the doc, when writing to ADDR here, + * the rounding is not to nearest, but to lowest + * (wine test). + * Since we use ARR next, substract 0.5. */ + ureg_SUB(tx->ureg, tx_dst_param(tx, &tx->insn.dst[0]), - tx_src_param(tx, &tx->insn.src[0])); + tx_src_param(tx, &tx->insn.src[0]), + ureg_imm1f(tx->ureg, 0.5f)); return D3D_OK; } return NineTranslateInstruction_Generic(tx); @@ -1430,46 +1457,36 @@ DECL_SPECIAL(LOOP) struct ureg_program *ureg = tx->ureg; unsigned *label; struct ureg_src src = tx_src_param(tx, &tx->insn.src[1]); - struct ureg_src iter = ureg_scalar(src, TGSI_SWIZZLE_X); - struct ureg_src init = ureg_scalar(src, TGSI_SWIZZLE_Y); - struct ureg_src step = ureg_scalar(src, TGSI_SWIZZLE_Z); struct ureg_dst ctr; - struct ureg_dst tmp = tx_scratch_scalar(tx); + struct ureg_dst tmp; + struct ureg_src ctrx; label = tx_bgnloop(tx); - ctr = tx_get_loopctr(tx); + ctr = tx_get_loopctr(tx, TRUE); + ctrx = ureg_scalar(ureg_src(ctr), TGSI_SWIZZLE_X); - ureg_MOV(tx->ureg, ctr, init); + /* src: num_iterations - start_value of al - step for al - 0 */ + ureg_MOV(ureg, ctr, src); ureg_BGNLOOP(tx->ureg, label); - if (tx->native_integers) { - /* we'll let the backend pull up that MAD ... */ - ureg_UMAD(ureg, tmp, iter, step, init); - ureg_USEQ(ureg, tmp, ureg_src(ctr), tx_src_scalar(tmp)); -#ifdef NINE_TGSI_LAZY_DEVS - ureg_UIF(ureg, tx_src_scalar(tmp), tx_cond(tx)); -#endif - } else { - /* can't simply use SGE for precision because step might be negative */ - ureg_MAD(ureg, tmp, iter, step, init); - ureg_SEQ(ureg, tmp, ureg_src(ctr), tx_src_scalar(tmp)); -#ifdef NINE_TGSI_LAZY_DEVS + tmp = tx_scratch_scalar(tx); + /* Initially ctr.x contains the number of iterations. + * ctr.y will contain the updated value of al. + * We decrease ctr.x at the end of every iteration, + * and stop when it reaches 0. */ + + if (!tx->native_integers) { + /* case src and ctr contain floats */ + /* to avoid precision issue, we stop when ctr <= 0.5 */ + ureg_SGE(ureg, tmp, ureg_imm1f(ureg, 0.5f), ctrx); ureg_IF(ureg, tx_src_scalar(tmp), tx_cond(tx)); -#endif + } else { + /* case src and ctr contain integers */ + ureg_ISGE(ureg, tmp, ureg_imm1i(ureg, 0), ctrx); + ureg_UIF(ureg, tx_src_scalar(tmp), tx_cond(tx)); } -#ifdef NINE_TGSI_LAZY_DEVS ureg_BRK(ureg); tx_endcond(tx); ureg_ENDIF(ureg); -#else - ureg_BREAKC(ureg, tx_src_scalar(tmp)); -#endif - if (tx->native_integers) { - ureg_UARL(ureg, tx_get_aL(tx), tx_src_scalar(ctr)); - ureg_UADD(ureg, ctr, tx_src_scalar(ctr), step); - } else { - ureg_ARL(ureg, tx_get_aL(tx), tx_src_scalar(ctr)); - ureg_ADD(ureg, ctr, tx_src_scalar(ctr), step); - } return D3D_OK; } @@ -1481,6 +1498,25 @@ DECL_SPECIAL(RET) DECL_SPECIAL(ENDLOOP) { + struct ureg_program *ureg = tx->ureg; + struct ureg_dst ctr = tx_get_loopctr(tx, TRUE); + struct ureg_dst dst_ctrx, dst_al; + struct ureg_src src_ctr, al_counter; + + dst_ctrx = ureg_writemask(ctr, NINED3DSP_WRITEMASK_0); + dst_al = ureg_writemask(ctr, NINED3DSP_WRITEMASK_1); + src_ctr = ureg_src(ctr); + al_counter = ureg_scalar(src_ctr, TGSI_SWIZZLE_Z); + + /* ctr.x -= 1 + * ctr.y (aL) += step */ + if (!tx->native_integers) { + ureg_ADD(ureg, dst_ctrx, src_ctr, ureg_imm1f(ureg, -1.0f)); + ureg_ADD(ureg, dst_al, src_ctr, al_counter); + } else { + ureg_UADD(ureg, dst_ctrx, src_ctr, ureg_imm1i(ureg, -1)); + ureg_UADD(ureg, dst_al, src_ctr, al_counter); + } ureg_ENDLOOP(tx->ureg, tx_endloop(tx)); return D3D_OK; } @@ -1530,7 +1566,7 @@ DECL_SPECIAL(REP) tx->native_integers ? ureg_imm1u(ureg, 0) : ureg_imm1f(ureg, 0.0f); label = tx_bgnloop(tx); - ctr = tx_get_loopctr(tx); + ctr = tx_get_loopctr(tx, FALSE); /* NOTE: rep must be constant, so we don't have to save the count */ assert(rep.File == TGSI_FILE_CONSTANT || rep.File == TGSI_FILE_IMMEDIATE); @@ -2331,8 +2367,9 @@ struct sm1_op_info inst_table[] = _OPI(ENDIF, ENDIF, V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(ENDIF)), _OPI(BREAK, BRK, V(2,1), V(3,0), V(2,1), V(3,0), 0, 0, NULL), _OPI(BREAKC, BREAKC, V(2,1), V(3,0), V(2,1), V(3,0), 0, 2, SPECIAL(BREAKC)), - - _OPI(MOVA, ARR, V(2,0), V(3,0), V(0,0), V(0,0), 1, 1, NULL), + /* we don't write to the address register, but a normal register (copied + * when needed to the address register), thus we don't use ARR */ + _OPI(MOVA, MOV, V(2,0), V(3,0), V(0,0), V(0,0), 1, 1, NULL), _OPI(DEFB, NOP, V(0,0), V(3,0) , V(0,0), V(3,0) , 1, 0, SPECIAL(DEFB)), _OPI(DEFI, NOP, V(0,0), V(3,0) , V(0,0), V(3,0) , 1, 0, SPECIAL(DEFI)), @@ -2749,11 +2786,11 @@ tx_ctor(struct shader_translator *tx, struct nine_shader_info *info) info->lconstf.data = NULL; info->lconstf.ranges = NULL; - for (i = 0; i < Elements(tx->regs.aL); ++i) { - tx->regs.aL[i] = ureg_dst_undef(); + for (i = 0; i < Elements(tx->regs.rL); ++i) { tx->regs.rL[i] = ureg_dst_undef(); } - tx->regs.a = ureg_dst_undef(); + tx->regs.address = ureg_dst_undef(); + tx->regs.a0 = ureg_dst_undef(); tx->regs.p = ureg_dst_undef(); tx->regs.oDepth = ureg_dst_undef(); tx->regs.vPos = ureg_src_undef(); -- 2.30.2