X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Ftgsi%2Ftgsi_exec.c;h=fe571a86bcad95c2bf7fea9f5b5c3764c9243046;hb=27a19be8d1c59c64240198261af348b868b101e4;hp=d9dc217e52b5b85c2bf6bd2d2340353588741868;hpb=ae7ae570ef9f61073a2c0e4b77fc23094767f74c;p=mesa.git diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index d9dc217e52b..fe571a86bca 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -53,6 +53,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" @@ -123,6 +124,103 @@ #define UPDATE_EXEC_MASK(MACH) \ MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + +static const union tgsi_exec_channel ZeroVec = + { { 0.0, 0.0, 0.0, 0.0 } }; + + +#ifdef DEBUG +static void +check_inf_or_nan(const union tgsi_exec_channel *chan) +{ + assert(!util_is_inf_or_nan(chan->f[0])); + assert(!util_is_inf_or_nan(chan->f[1])); + assert(!util_is_inf_or_nan(chan->f[2])); + assert(!util_is_inf_or_nan(chan->f[3])); +} +#endif + + +#ifdef DEBUG +static void +print_chan(const char *msg, const union tgsi_exec_channel *chan) +{ + debug_printf("%s = {%f, %f, %f, %f}\n", + msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]); +} +#endif + + +#ifdef DEBUG +static void +print_temp(const struct tgsi_exec_machine *mach, uint index) +{ + const struct tgsi_exec_vector *tmp = &mach->Temps[index]; + int i; + debug_printf("Temp[%u] =\n", index); + for (i = 0; i < 4; i++) { + debug_printf(" %c: { %f, %f, %f, %f }\n", + "XYZW"[i], + tmp->xyzw[i].f[0], + tmp->xyzw[i].f[1], + tmp->xyzw[i].f[2], + tmp->xyzw[i].f[3]); + } +} +#endif + + +/** + * Check if there's a potential src/dst register data dependency when + * using SOA execution. + * Example: + * MOV T, T.yxwz; + * This would expand into: + * MOV t0, t1; + * MOV t1, t0; + * MOV t2, t3; + * MOV t3, t2; + * The second instruction will have the wrong value for t0 if executed as-is. + */ +static boolean +tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst) +{ + uint i, chan; + + uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask; + if (writemask == TGSI_WRITEMASK_X || + writemask == TGSI_WRITEMASK_Y || + writemask == TGSI_WRITEMASK_Z || + writemask == TGSI_WRITEMASK_W || + writemask == TGSI_WRITEMASK_NONE) { + /* no chance of data dependency */ + return FALSE; + } + + /* loop over src regs */ + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + if ((inst->FullSrcRegisters[i].SrcRegister.File == + inst->FullDstRegisters[0].DstRegister.File) && + (inst->FullSrcRegisters[i].SrcRegister.Index == + inst->FullDstRegisters[0].DstRegister.Index)) { + /* loop over dest channels */ + uint channelsWritten = 0x0; + FOR_EACH_ENABLED_CHANNEL(*inst, chan) { + /* check if we're reading a channel that's been written */ + uint swizzle = tgsi_util_get_full_src_register_extswizzle(&inst->FullSrcRegisters[i], chan); + if (swizzle <= TGSI_SWIZZLE_W && + (channelsWritten & (1 << swizzle))) { + return TRUE; + } + + channelsWritten |= (1 << chan); + } + } + } + return FALSE; +} + + /** * Initialize machine state by expanding tokens to full instructions, * allocating temporary storage, setting up constants, etc. @@ -233,6 +331,17 @@ tgsi_exec_machine_bind_shader( memcpy(instructions + numInstructions, &parse.FullToken.FullInstruction, sizeof(instructions[0])); + +#if 0 + if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) { + debug_printf("SOA dependency in instruction:\n"); + tgsi_dump_instruction(&parse.FullToken.FullInstruction, + numInstructions); + } +#else + (void) tgsi_check_soa_dependencies; +#endif + numInstructions++; break; @@ -256,15 +365,26 @@ tgsi_exec_machine_bind_shader( } -void -tgsi_exec_machine_init( - struct tgsi_exec_machine *mach ) +struct tgsi_exec_machine * +tgsi_exec_machine_create( void ) { + struct tgsi_exec_machine *mach; uint i; - mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps); + mach = align_malloc( sizeof *mach, 16 ); + if (!mach) + goto fail; + mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR]; + mach->Samplers = NULL; + mach->Consts = NULL; + mach->Tokens = NULL; + mach->Primitives = NULL; + mach->InterpCoefs = NULL; + mach->Instructions = NULL; + mach->Declarations = NULL; + /* Setup constants. */ for( i = 0; i < 4; i++ ) { mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000; @@ -278,22 +398,30 @@ tgsi_exec_machine_init( mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f; mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f; } + +#ifdef DEBUG + /* silence warnings */ + (void) print_chan; + (void) print_temp; +#endif + + return mach; + +fail: + align_free(mach); + return NULL; } void -tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach) +tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach) { - if (mach->Instructions) { + if (mach) { FREE(mach->Instructions); - mach->Instructions = NULL; - mach->NumInstructions = 0; - } - if (mach->Declarations) { FREE(mach->Declarations); - mach->Declarations = NULL; - mach->NumDeclarations = 0; } + + align_free(mach); } @@ -1280,6 +1408,48 @@ store_dest( union tgsi_exec_channel null; union tgsi_exec_channel *dst; uint execmask = mach->ExecMask; + int offset = 0; /* indirection offset */ + int index; + +#ifdef DEBUG + check_inf_or_nan(chan); +#endif + + /* There is an extra source register that indirectly subscripts + * a register file. The direct index now becomes an offset + * that is being added to the indirect register. + * + * file[ind[2].x+1], + * where: + * ind = DstRegisterInd.File + * [2] = DstRegisterInd.Index + * .x = DstRegisterInd.SwizzleX + */ + if (reg->DstRegister.Indirect) { + union tgsi_exec_channel index; + union tgsi_exec_channel indir_index; + uint swizzle; + + /* which address register (always zero for now) */ + index.i[0] = + index.i[1] = + index.i[2] = + index.i[3] = reg->DstRegisterInd.Index; + + /* get current value of address register[swizzle] */ + swizzle = tgsi_util_get_src_register_swizzle( ®->DstRegisterInd, CHAN_X ); + + /* fetch values from the address/indirection register */ + fetch_src_file_channel( + mach, + reg->DstRegisterInd.File, + swizzle, + &index, + &indir_index ); + + /* save indirection offset */ + offset = (int) indir_index.f[0]; + } switch (reg->DstRegister.File) { case TGSI_FILE_NULL: @@ -1287,17 +1457,20 @@ store_dest( break; case TGSI_FILE_OUTPUT: - dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] - + reg->DstRegister.Index].xyzw[chan_index]; + index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index; + dst = &mach->Outputs[offset + index].xyzw[chan_index]; break; case TGSI_FILE_TEMPORARY: - assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS ); - dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + index = reg->DstRegister.Index; + assert( index < TGSI_EXEC_NUM_TEMPS ); + dst = &mach->Temps[offset + index].xyzw[chan_index]; break; case TGSI_FILE_ADDRESS: - dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + index = reg->DstRegister.Index; + dst = &mach->Addrs[index].xyzw[chan_index]; break; default: @@ -1643,7 +1816,7 @@ exec_tex(struct tgsi_exec_machine *mach, lodBias = 0.0; fetch_texel(mach->Samplers[unit], - &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ + &r[0], &ZeroVec, &ZeroVec, lodBias, /* S, T, P, BIAS */ &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ break; @@ -1841,13 +2014,13 @@ exec_instruction( int *pc ) { uint chan_index; - union tgsi_exec_channel r[8]; + union tgsi_exec_channel r[10]; (*pc)++; switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ARL: - /* TGSI_OPCODE_FLOOR */ + case TGSI_OPCODE_FLOOR: /* TGSI_OPCODE_FLR */ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); @@ -2391,7 +2564,52 @@ exec_instruction( break; case TGSI_OPCODE_RFL: - assert (0); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + /* r0 = dp3(src0, src0) */ + FETCH(&r[2], 0, CHAN_X); + micro_mul(&r[0], &r[2], &r[2]); + FETCH(&r[4], 0, CHAN_Y); + micro_mul(&r[8], &r[4], &r[4]); + micro_add(&r[0], &r[0], &r[8]); + FETCH(&r[6], 0, CHAN_Z); + micro_mul(&r[8], &r[6], &r[6]); + micro_add(&r[0], &r[0], &r[8]); + + /* r1 = dp3(src0, src1) */ + FETCH(&r[3], 1, CHAN_X); + micro_mul(&r[1], &r[2], &r[3]); + FETCH(&r[5], 1, CHAN_Y); + micro_mul(&r[8], &r[4], &r[5]); + micro_add(&r[1], &r[1], &r[8]); + FETCH(&r[7], 1, CHAN_Z); + micro_mul(&r[8], &r[6], &r[7]); + micro_add(&r[1], &r[1], &r[8]); + + /* r1 = 2 * r1 / r0 */ + micro_add(&r[1], &r[1], &r[1]); + micro_div(&r[1], &r[1], &r[0]); + + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + micro_mul(&r[2], &r[2], &r[1]); + micro_sub(&r[2], &r[2], &r[3]); + STORE(&r[2], 0, CHAN_X); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + micro_mul(&r[4], &r[4], &r[1]); + micro_sub(&r[4], &r[4], &r[5]); + STORE(&r[4], 0, CHAN_Y); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + micro_mul(&r[6], &r[6], &r[1]); + micro_sub(&r[6], &r[6], &r[7]); + STORE(&r[6], 0, CHAN_Z); + } + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W); + } break; case TGSI_OPCODE_SEQ: @@ -2406,7 +2624,9 @@ exec_instruction( break; case TGSI_OPCODE_SFL: - assert (0); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index); + } break; case TGSI_OPCODE_SGT: @@ -2445,7 +2665,9 @@ exec_instruction( break; case TGSI_OPCODE_STR: - assert (0); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index); + } break; case TGSI_OPCODE_TEX: @@ -2502,7 +2724,40 @@ exec_instruction( break; case TGSI_OPCODE_X2D: - assert (0); + FETCH(&r[0], 1, CHAN_X); + FETCH(&r[1], 1, CHAN_Y); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + FETCH(&r[2], 2, CHAN_X); + micro_mul(&r[2], &r[2], &r[0]); + FETCH(&r[3], 2, CHAN_Y); + micro_mul(&r[3], &r[3], &r[1]); + micro_add(&r[2], &r[2], &r[3]); + FETCH(&r[3], 0, CHAN_X); + micro_add(&r[2], &r[2], &r[3]); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + STORE(&r[2], 0, CHAN_X); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + STORE(&r[2], 0, CHAN_Z); + } + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + FETCH(&r[2], 2, CHAN_Z); + micro_mul(&r[2], &r[2], &r[0]); + FETCH(&r[3], 2, CHAN_W); + micro_mul(&r[3], &r[3], &r[1]); + micro_add(&r[2], &r[2], &r[3]); + FETCH(&r[3], 0, CHAN_Y); + micro_add(&r[2], &r[2], &r[3]); + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + STORE(&r[2], 0, CHAN_Y); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + STORE(&r[2], 0, CHAN_W); + } + } break; case TGSI_OPCODE_ARA: @@ -2587,14 +2842,14 @@ exec_instruction( case TGSI_OPCODE_SCS: if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { FETCH( &r[0], 0, CHAN_X ); - } - if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { - micro_cos( &r[1], &r[0] ); - STORE( &r[1], 0, CHAN_X ); - } - if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { - micro_sin( &r[1], &r[0] ); - STORE( &r[1], 0, CHAN_Y ); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + micro_cos(&r[1], &r[0]); + STORE(&r[1], 0, CHAN_X); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + micro_sin(&r[1], &r[0]); + STORE(&r[1], 0, CHAN_Y); + } } if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); @@ -2606,31 +2861,35 @@ exec_instruction( case TGSI_OPCODE_NRM: /* 3-component vector normalize */ - { - union tgsi_exec_channel tmp, dot; - - /* tmp = dp3(src0, src0): */ - FETCH( &r[0], 0, CHAN_X ); - micro_mul( &tmp, &r[0], &r[0] ); - - FETCH( &r[1], 0, CHAN_Y ); - micro_mul( &dot, &r[1], &r[1] ); - micro_add( &tmp, &tmp, &dot ); - - FETCH( &r[2], 0, CHAN_Z ); - micro_mul( &dot, &r[2], &r[2] ); - micro_add( &tmp, &tmp, &dot ); - - /* tmp = 1 / sqrt(tmp) */ - micro_sqrt( &tmp, &tmp ); - micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp ); - - /* note: w channel is undefined */ - FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - /* chan = chan * tmp */ - micro_mul( &r[chan_index], &tmp, &r[chan_index] ); - STORE( &r[chan_index], 0, chan_index ); + if(IS_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + /* r3 = sqrt(dp3(src0, src0)) */ + FETCH(&r[0], 0, CHAN_X); + micro_mul(&r[3], &r[0], &r[0]); + FETCH(&r[1], 0, CHAN_Y); + micro_mul(&r[4], &r[1], &r[1]); + micro_add(&r[3], &r[3], &r[4]); + FETCH(&r[2], 0, CHAN_Z); + micro_mul(&r[4], &r[2], &r[2]); + micro_add(&r[3], &r[3], &r[4]); + micro_sqrt(&r[3], &r[3]); + + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + micro_div(&r[0], &r[0], &r[3]); + STORE(&r[0], 0, CHAN_X); } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + micro_div(&r[1], &r[1], &r[3]); + STORE(&r[1], 0, CHAN_Y); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + micro_div(&r[2], &r[2], &r[3]); + STORE(&r[2], 0, CHAN_Z); + } + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W); } break;