X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Ftgsi%2Ftgsi_exec.c;h=fe571a86bcad95c2bf7fea9f5b5c3764c9243046;hb=27a19be8d1c59c64240198261af348b868b101e4;hp=94589cf79f1a7f1ce4e79df3f1eb4e85c7b53bfa;hpb=dceb09909ea9d6eaef0334897ebed6da45db6faa;p=mesa.git diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 94589cf79f1..fe571a86bca 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -53,6 +53,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" @@ -123,6 +124,103 @@ #define UPDATE_EXEC_MASK(MACH) \ MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + +static const union tgsi_exec_channel ZeroVec = + { { 0.0, 0.0, 0.0, 0.0 } }; + + +#ifdef DEBUG +static void +check_inf_or_nan(const union tgsi_exec_channel *chan) +{ + assert(!util_is_inf_or_nan(chan->f[0])); + assert(!util_is_inf_or_nan(chan->f[1])); + assert(!util_is_inf_or_nan(chan->f[2])); + assert(!util_is_inf_or_nan(chan->f[3])); +} +#endif + + +#ifdef DEBUG +static void +print_chan(const char *msg, const union tgsi_exec_channel *chan) +{ + debug_printf("%s = {%f, %f, %f, %f}\n", + msg, chan->f[0], chan->f[1], chan->f[2], chan->f[3]); +} +#endif + + +#ifdef DEBUG +static void +print_temp(const struct tgsi_exec_machine *mach, uint index) +{ + const struct tgsi_exec_vector *tmp = &mach->Temps[index]; + int i; + debug_printf("Temp[%u] =\n", index); + for (i = 0; i < 4; i++) { + debug_printf(" %c: { %f, %f, %f, %f }\n", + "XYZW"[i], + tmp->xyzw[i].f[0], + tmp->xyzw[i].f[1], + tmp->xyzw[i].f[2], + tmp->xyzw[i].f[3]); + } +} +#endif + + +/** + * Check if there's a potential src/dst register data dependency when + * using SOA execution. 
+ * Example: + * MOV T, T.yxwz; + * This would expand into: + * MOV t0, t1; + * MOV t1, t0; + * MOV t2, t3; + * MOV t3, t2; + * The second instruction will have the wrong value for t0 if executed as-is. + */ +static boolean +tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst) +{ + uint i, chan; + + uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask; + if (writemask == TGSI_WRITEMASK_X || + writemask == TGSI_WRITEMASK_Y || + writemask == TGSI_WRITEMASK_Z || + writemask == TGSI_WRITEMASK_W || + writemask == TGSI_WRITEMASK_NONE) { + /* no chance of data dependency */ + return FALSE; + } + + /* loop over src regs */ + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + if ((inst->FullSrcRegisters[i].SrcRegister.File == + inst->FullDstRegisters[0].DstRegister.File) && + (inst->FullSrcRegisters[i].SrcRegister.Index == + inst->FullDstRegisters[0].DstRegister.Index)) { + /* loop over dest channels */ + uint channelsWritten = 0x0; + FOR_EACH_ENABLED_CHANNEL(*inst, chan) { + /* check if we're reading a channel that's been written */ + uint swizzle = tgsi_util_get_full_src_register_extswizzle(&inst->FullSrcRegisters[i], chan); + if (swizzle <= TGSI_SWIZZLE_W && + (channelsWritten & (1 << swizzle))) { + return TRUE; + } + + channelsWritten |= (1 << chan); + } + } + } + return FALSE; +} + + /** * Initialize machine state by expanding tokens to full instructions, * allocating temporary storage, setting up constants, etc. 
@@ -233,6 +331,17 @@ tgsi_exec_machine_bind_shader( memcpy(instructions + numInstructions, &parse.FullToken.FullInstruction, sizeof(instructions[0])); + +#if 0 + if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) { + debug_printf("SOA dependency in instruction:\n"); + tgsi_dump_instruction(&parse.FullToken.FullInstruction, + numInstructions); + } +#else + (void) tgsi_check_soa_dependencies; +#endif + numInstructions++; break; @@ -256,15 +365,26 @@ tgsi_exec_machine_bind_shader( } -void -tgsi_exec_machine_init( - struct tgsi_exec_machine *mach ) +struct tgsi_exec_machine * +tgsi_exec_machine_create( void ) { + struct tgsi_exec_machine *mach; uint i; - mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps); + mach = align_malloc( sizeof *mach, 16 ); + if (!mach) + goto fail; + mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR]; + mach->Samplers = NULL; + mach->Consts = NULL; + mach->Tokens = NULL; + mach->Primitives = NULL; + mach->InterpCoefs = NULL; + mach->Instructions = NULL; + mach->Declarations = NULL; + /* Setup constants. 
*/ for( i = 0; i < 4; i++ ) { mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000; @@ -278,22 +398,30 @@ tgsi_exec_machine_init( mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f; mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f; } + +#ifdef DEBUG + /* silence warnings */ + (void) print_chan; + (void) print_temp; +#endif + + return mach; + +fail: + align_free(mach); + return NULL; } void -tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach) +tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach) { - if (mach->Instructions) { + if (mach) { FREE(mach->Instructions); - mach->Instructions = NULL; - mach->NumInstructions = 0; - } - if (mach->Declarations) { FREE(mach->Declarations); - mach->Declarations = NULL; - mach->NumDeclarations = 0; } + + align_free(mach); } @@ -485,6 +613,32 @@ micro_f2ut( } #endif +static void +micro_float_clamp(union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src) +{ + uint i; + + for (i = 0; i < 4; i++) { + if (src->f[i] > 0.0f) { + if (src->f[i] > 1.884467e+019f) + dst->f[i] = 1.884467e+019f; + else if (src->f[i] < 5.42101e-020f) + dst->f[i] = 5.42101e-020f; + else + dst->f[i] = src->f[i]; + } + else { + if (src->f[i] < -1.884467e+019f) + dst->f[i] = -1.884467e+019f; + else if (src->f[i] > -5.42101e-020f) + dst->f[i] = -5.42101e-020f; + else + dst->f[i] = src->f[i]; + } + } +} + static void micro_flr( union tgsi_exec_channel *dst, @@ -507,20 +661,6 @@ micro_frc( dst->f[3] = src->f[3] - floorf( src->f[3] ); } -static void -micro_ge( - union tgsi_exec_channel *dst, - const union tgsi_exec_channel *src0, - const union tgsi_exec_channel *src1, - const union tgsi_exec_channel *src2, - const union tgsi_exec_channel *src3 ) -{ - dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0]; - dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1]; - dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2]; - dst->f[3] = src0->f[3] >= src1->f[3] ? 
src2->f[3] : src3->f[3]; -} - static void micro_i2f( union tgsi_exec_channel *dst, @@ -1268,6 +1408,48 @@ store_dest( union tgsi_exec_channel null; union tgsi_exec_channel *dst; uint execmask = mach->ExecMask; + int offset = 0; /* indirection offset */ + int index; + +#ifdef DEBUG + check_inf_or_nan(chan); +#endif + + /* There is an extra source register that indirectly subscripts + * a register file. The direct index now becomes an offset + * that is being added to the indirect register. + * + * file[ind[2].x+1], + * where: + * ind = DstRegisterInd.File + * [2] = DstRegisterInd.Index + * .x = DstRegisterInd.SwizzleX + */ + if (reg->DstRegister.Indirect) { + union tgsi_exec_channel index; + union tgsi_exec_channel indir_index; + uint swizzle; + + /* which address register (always zero for now) */ + index.i[0] = + index.i[1] = + index.i[2] = + index.i[3] = reg->DstRegisterInd.Index; + + /* get current value of address register[swizzle] */ + swizzle = tgsi_util_get_src_register_swizzle( &reg->DstRegisterInd, CHAN_X ); + + /* fetch values from the address/indirection register */ + fetch_src_file_channel( + mach, + reg->DstRegisterInd.File, + swizzle, + &index, + &indir_index ); + + /* save indirection offset */ + offset = (int) indir_index.f[0]; + } switch (reg->DstRegister.File) { case TGSI_FILE_NULL: @@ -1275,17 +1457,20 @@ store_dest( break; case TGSI_FILE_OUTPUT: - dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] - + reg->DstRegister.Index].xyzw[chan_index]; + index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index; + dst = &mach->Outputs[offset + index].xyzw[chan_index]; break; case TGSI_FILE_TEMPORARY: - assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS ); - dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + index = reg->DstRegister.Index; + assert( index < TGSI_EXEC_NUM_TEMPS ); + dst = &mach->Temps[offset + index].xyzw[chan_index]; break; case TGSI_FILE_ADDRESS: - dst = 
&mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + index = reg->DstRegister.Index; + dst = &mach->Addrs[index].xyzw[chan_index]; break; default: @@ -1631,7 +1816,7 @@ exec_tex(struct tgsi_exec_machine *mach, lodBias = 0.0; fetch_texel(mach->Samplers[unit], - &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ + &r[0], &ZeroVec, &ZeroVec, lodBias, /* S, T, P, BIAS */ &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ break; @@ -1829,12 +2014,14 @@ exec_instruction( int *pc ) { uint chan_index; - union tgsi_exec_channel r[8]; + union tgsi_exec_channel r[10]; (*pc)++; switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ARL: + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); micro_flr( &r[0], &r[0] ); @@ -1852,31 +2039,31 @@ exec_instruction( case TGSI_OPCODE_LIT: if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { - STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { - FETCH( &r[0], 0, CHAN_X ); - if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { - micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); - STORE( &r[0], 0, CHAN_Y ); - } + FETCH( &r[0], 0, CHAN_X ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Y ); + } - if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { - FETCH( &r[1], 0, CHAN_Y ); - micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[1], 0, CHAN_Y ); + micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); - FETCH( &r[2], 0, CHAN_W ); - micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] ); - micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] ); - micro_pow( &r[1], &r[1], &r[2] ); - micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 
&r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); - STORE( &r[0], 0, CHAN_Z ); - } + FETCH( &r[2], 0, CHAN_W ); + micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] ); + micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] ); + micro_pow( &r[1], &r[1], &r[2] ); + micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Z ); + } } if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { - STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); } break; @@ -1885,7 +2072,7 @@ exec_instruction( FETCH( &r[0], 0, CHAN_X ); micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); } break; @@ -1896,7 +2083,7 @@ exec_instruction( micro_sqrt( &r[0], &r[0] ); micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); } break; @@ -2009,30 +2196,30 @@ exec_instruction( micro_add( &r[0], &r[0], &r[1] ); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_DST: if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { - STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { - FETCH( &r[0], 0, CHAN_Y ); - FETCH( &r[1], 1, CHAN_Y); - micro_mul( &r[0], &r[0], &r[1] ); - STORE( &r[0], 0, CHAN_Y ); + FETCH( &r[0], 0, CHAN_Y ); + FETCH( &r[1], 1, CHAN_Y); + micro_mul( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, CHAN_Y ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { - FETCH( &r[0], 0, CHAN_Z ); - STORE( &r[0], 0, CHAN_Z ); + FETCH( &r[0], 0, CHAN_Z ); + STORE( &r[0], 0, CHAN_Z ); } if (IS_CHANNEL_ENABLED( 
*inst, CHAN_W )) { - FETCH( &r[0], 1, CHAN_W ); - STORE( &r[0], 0, CHAN_W ); + FETCH( &r[0], 1, CHAN_W ); + STORE( &r[0], 0, CHAN_W ); } break; @@ -2075,7 +2262,7 @@ exec_instruction( FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); - micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + micro_le( &r[0], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); STORE( &r[0], 0, chan_index ); } break; @@ -2119,15 +2306,27 @@ exec_instruction( break; case TGSI_OPCODE_CND: - assert (0); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + micro_lt(&r[0], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]); + STORE(&r[0], 0, chan_index); + } break; case TGSI_OPCODE_CND0: - assert (0); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + micro_le(&r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[2], &r[0], &r[1]); + STORE(&r[0], 0, chan_index); + } break; case TGSI_OPCODE_DOT2ADD: - /* TGSI_OPCODE_DP2A */ + /* TGSI_OPCODE_DP2A */ FETCH( &r[0], 0, CHAN_X ); FETCH( &r[1], 1, CHAN_X ); micro_mul( &r[0], &r[0], &r[1] ); @@ -2146,10 +2345,12 @@ exec_instruction( break; case TGSI_OPCODE_INDEX: + /* XXX: considered for removal */ assert (0); break; case TGSI_OPCODE_NEGATE: + /* XXX: considered for removal */ assert (0); break; @@ -2163,15 +2364,13 @@ exec_instruction( break; case TGSI_OPCODE_CLAMP: - assert (0); - break; - - case TGSI_OPCODE_FLOOR: - /* TGSI_OPCODE_FLR */ - FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( &r[0], 0, chan_index ); - micro_flr( &r[0], &r[0] ); - STORE( &r[0], 0, chan_index ); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + 
micro_max(&r[0], &r[0], &r[1]); + FETCH(&r[1], 2, chan_index); + micro_min(&r[0], &r[0], &r[1]); + STORE(&r[0], 0, chan_index); } break; @@ -2185,7 +2384,7 @@ exec_instruction( break; case TGSI_OPCODE_EXPBASE2: - /* TGSI_OPCODE_EX2 */ + /* TGSI_OPCODE_EX2 */ FETCH(&r[0], 0, CHAN_X); #if FAST_MATH @@ -2195,7 +2394,7 @@ exec_instruction( #endif FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); } break; @@ -2209,19 +2408,19 @@ exec_instruction( break; case TGSI_OPCODE_POWER: - /* TGSI_OPCODE_POW */ + /* TGSI_OPCODE_POW */ FETCH(&r[0], 0, CHAN_X); FETCH(&r[1], 1, CHAN_X); micro_pow( &r[0], &r[0], &r[1] ); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_CROSSPRODUCT: - /* TGSI_OPCODE_XPD */ + /* TGSI_OPCODE_XPD */ FETCH(&r[0], 0, CHAN_Y); FETCH(&r[1], 1, CHAN_Z); @@ -2264,6 +2463,7 @@ exec_instruction( break; case TGSI_OPCODE_MULTIPLYMATRIX: + /* XXX: considered for removal */ assert (0); break; @@ -2278,7 +2478,12 @@ exec_instruction( break; case TGSI_OPCODE_RCC: - assert (0); + FETCH(&r[0], 0, CHAN_X); + micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]); + micro_float_clamp(&r[0], &r[0]); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + STORE(&r[0], 0, chan_index); + } break; case TGSI_OPCODE_DPH: @@ -2304,7 +2509,7 @@ exec_instruction( micro_add( &r[0], &r[0], &r[1] ); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); } break; @@ -2314,7 +2519,7 @@ exec_instruction( micro_cos( &r[0], &r[0] ); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); } break; @@ -2359,7 +2564,52 @@ exec_instruction( break; case TGSI_OPCODE_RFL: - assert (0); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + /* r0 = dp3(src0, 
src0) */ + FETCH(&r[2], 0, CHAN_X); + micro_mul(&r[0], &r[2], &r[2]); + FETCH(&r[4], 0, CHAN_Y); + micro_mul(&r[8], &r[4], &r[4]); + micro_add(&r[0], &r[0], &r[8]); + FETCH(&r[6], 0, CHAN_Z); + micro_mul(&r[8], &r[6], &r[6]); + micro_add(&r[0], &r[0], &r[8]); + + /* r1 = dp3(src0, src1) */ + FETCH(&r[3], 1, CHAN_X); + micro_mul(&r[1], &r[2], &r[3]); + FETCH(&r[5], 1, CHAN_Y); + micro_mul(&r[8], &r[4], &r[5]); + micro_add(&r[1], &r[1], &r[8]); + FETCH(&r[7], 1, CHAN_Z); + micro_mul(&r[8], &r[6], &r[7]); + micro_add(&r[1], &r[1], &r[8]); + + /* r1 = 2 * r1 / r0 */ + micro_add(&r[1], &r[1], &r[1]); + micro_div(&r[1], &r[1], &r[0]); + + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + micro_mul(&r[2], &r[2], &r[1]); + micro_sub(&r[2], &r[2], &r[3]); + STORE(&r[2], 0, CHAN_X); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + micro_mul(&r[4], &r[4], &r[1]); + micro_sub(&r[4], &r[4], &r[5]); + STORE(&r[4], 0, CHAN_Y); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + micro_mul(&r[6], &r[6], &r[1]); + micro_sub(&r[6], &r[6], &r[7]); + STORE(&r[6], 0, CHAN_Z); + } + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W); + } break; case TGSI_OPCODE_SEQ: @@ -2374,7 +2624,9 @@ exec_instruction( break; case TGSI_OPCODE_SFL: - assert (0); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index); + } break; case TGSI_OPCODE_SGT: @@ -2413,7 +2665,9 @@ exec_instruction( break; case TGSI_OPCODE_STR: - assert (0); + FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) { + STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index); + } break; case TGSI_OPCODE_TEX: @@ -2470,7 +2724,40 @@ exec_instruction( break; case TGSI_OPCODE_X2D: - assert (0); + FETCH(&r[0], 1, CHAN_X); + FETCH(&r[1], 1, CHAN_Y); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + FETCH(&r[2], 2, CHAN_X); + micro_mul(&r[2], &r[2], &r[0]); + FETCH(&r[3], 2, CHAN_Y); + micro_mul(&r[3], &r[3], &r[1]); 
+ micro_add(&r[2], &r[2], &r[3]); + FETCH(&r[3], 0, CHAN_X); + micro_add(&r[2], &r[2], &r[3]); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + STORE(&r[2], 0, CHAN_X); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + STORE(&r[2], 0, CHAN_Z); + } + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + FETCH(&r[2], 2, CHAN_Z); + micro_mul(&r[2], &r[2], &r[0]); + FETCH(&r[3], 2, CHAN_W); + micro_mul(&r[3], &r[3], &r[1]); + micro_add(&r[2], &r[2], &r[3]); + FETCH(&r[3], 0, CHAN_Y); + micro_add(&r[2], &r[2], &r[3]); + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + STORE(&r[2], 0, CHAN_Y); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + STORE(&r[2], 0, CHAN_W); + } + } break; case TGSI_OPCODE_ARA: @@ -2555,14 +2842,14 @@ exec_instruction( case TGSI_OPCODE_SCS: if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { FETCH( &r[0], 0, CHAN_X ); - } - if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { - micro_cos( &r[1], &r[0] ); - STORE( &r[1], 0, CHAN_X ); - } - if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { - micro_sin( &r[1], &r[0] ); - STORE( &r[1], 0, CHAN_Y ); + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + micro_cos(&r[1], &r[0]); + STORE(&r[1], 0, CHAN_X); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + micro_sin(&r[1], &r[0]); + STORE(&r[1], 0, CHAN_Y); + } } if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); @@ -2574,31 +2861,35 @@ exec_instruction( case TGSI_OPCODE_NRM: /* 3-component vector normalize */ - { - union tgsi_exec_channel tmp, dot; - - /* tmp = dp3(src0, src0): */ - FETCH( &r[0], 0, CHAN_X ); - micro_mul( &tmp, &r[0], &r[0] ); - - FETCH( &r[1], 0, CHAN_Y ); - micro_mul( &dot, &r[1], &r[1] ); - micro_add( &tmp, &tmp, &dot ); - - FETCH( &r[2], 0, CHAN_Z ); - micro_mul( &dot, &r[2], &r[2] ); - micro_add( &tmp, &tmp, &dot ); - - /* tmp = 1 / sqrt(tmp) */ - micro_sqrt( &tmp, &tmp ); - micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp ); - - /* note: w 
channel is undefined */ - FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { - /* chan = chan * tmp */ - micro_mul( &r[chan_index], &tmp, &r[chan_index] ); - STORE( &r[chan_index], 0, chan_index ); + if(IS_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + /* r3 = sqrt(dp3(src0, src0)) */ + FETCH(&r[0], 0, CHAN_X); + micro_mul(&r[3], &r[0], &r[0]); + FETCH(&r[1], 0, CHAN_Y); + micro_mul(&r[4], &r[1], &r[1]); + micro_add(&r[3], &r[3], &r[4]); + FETCH(&r[2], 0, CHAN_Z); + micro_mul(&r[4], &r[2], &r[2]); + micro_add(&r[3], &r[3], &r[4]); + micro_sqrt(&r[3], &r[3]); + + if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) { + micro_div(&r[0], &r[0], &r[3]); + STORE(&r[0], 0, CHAN_X); } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) { + micro_div(&r[1], &r[1], &r[3]); + STORE(&r[1], 0, CHAN_Y); + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) { + micro_div(&r[2], &r[2], &r[3]); + STORE(&r[2], 0, CHAN_Z); + } + } + if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) { + STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W); } break; @@ -2960,5 +3251,3 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; } - -