X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Ftgsi%2Ftgsi_sse2.c;h=a4b86aba98660fe24281811b597487fc63ec0981;hb=27a19be8d1c59c64240198261af348b868b101e4;hp=3df0c5db3fa9c125660c320d09fab82de95512ce;hpb=c417a2c3f37a6a28947db5dc5aa240473d29dd19;p=mesa.git diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 3df0c5db3fa..a4b86aba986 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -27,12 +27,15 @@ #include "pipe/p_config.h" -#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) +#if defined(PIPE_ARCH_X86) -#include "pipe/p_debug.h" +#include "util/u_debug.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" +#include "util/u_memory.h" +#if defined(PIPE_ARCH_SSE) #include "util/u_sse.h" +#endif #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" @@ -98,27 +101,43 @@ get_const_base( void ) { return x86_make_reg( file_REG32, - reg_CX ); + reg_AX ); } static struct x86_reg -get_input_base( void ) +get_machine_base( void ) { return x86_make_reg( file_REG32, - reg_AX ); + reg_CX ); +} + +static struct x86_reg +get_input_base( void ) +{ + return x86_make_disp( + get_machine_base(), + Offset(struct tgsi_exec_machine, Inputs) ); } static struct x86_reg get_output_base( void ) { - return x86_make_reg( - file_REG32, - reg_DX ); + return x86_make_disp( + get_machine_base(), + Offset(struct tgsi_exec_machine, Outputs) ); } static struct x86_reg get_temp_base( void ) +{ + return x86_make_disp( + get_machine_base(), + Offset(struct tgsi_exec_machine, Temps) ); +} + +static struct x86_reg +get_coef_base( void ) { return x86_make_reg( file_REG32, @@ -126,9 +145,11 @@ get_temp_base( void ) } static struct x86_reg -get_coef_base( void ) +get_sampler_base( void ) { - return get_output_base(); + return x86_make_reg( + file_REG32, + reg_DI ); } static struct x86_reg @@ -136,7 +157,7 @@ get_immediate_base( void ) { return x86_make_reg( file_REG32, - reg_DI ); + reg_DX ); } @@ -165,6 +186,15 @@ get_const( (vec * 4 + chan) * 4 ); } +static struct x86_reg +get_sampler_ptr( + unsigned unit ) +{ + return x86_make_disp( + get_sampler_base(), + unit * sizeof( struct tgsi_sampler * ) ); +} + static struct x86_reg get_input( unsigned vec, @@ -239,12 +269,14 @@ emit_const( /* 'vec' is the offset from the address register's value. * We're loading CONST[ADDR+vec] into an xmm register. */ - struct x86_reg r0 = get_input_base(); - struct x86_reg r1 = get_output_base(); + struct x86_reg r0 = get_immediate_base(); + struct x86_reg r1 = get_coef_base(); uint i; assert( indirectFile == TGSI_FILE_ADDRESS ); assert( indirectIndex == 0 ); + assert( r0.mod == mod_REG ); + assert( r1.mod == mod_REG ); x86_push( func, r0 ); x86_push( func, r1 ); @@ -518,24 +550,15 @@ emit_coef_dady( * that the stack pointer is 16 byte aligned, as expected. */ static void -emit_func_call_dst( +emit_func_call( struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst, + unsigned xmm_save_mask, + const struct x86_reg *arg, + unsigned nr_args, void (PIPE_CDECL *code)() ) { struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - unsigned i, n, xmm; - unsigned xmm_mask; - - /* Bitmask of the xmm registers to save */ - xmm_mask = (1 << xmm_save) - 1; - xmm_mask &= ~(1 << xmm_dst); - - sse_movaps( - func, - get_temp( TEMP_R0, 0 ), - make_xmm( xmm_dst ) ); + unsigned i, n; x86_push( func, @@ -547,8 +570,10 @@ emit_func_call_dst( func, x86_make_reg( file_REG32, reg_DX) ); + /* Store XMM regs to the stack + */ for(i = 0, n = 0; i < 8; ++i) - if(xmm_mask & (1 << i)) + if(xmm_save_mask & (1 << i)) ++n; x86_sub_imm( @@ -557,29 +582,45 @@ emit_func_call_dst( n*16); for(i = 0, n = 0; i < 8; ++i) - if(xmm_mask & (1 << i)) { + if(xmm_save_mask & (1 << i)) { sse_movups( func, x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ), - make_xmm( xmm ) ); + make_xmm( i ) ); ++n; } + + for (i = 0; i < nr_args; i++) { + /* Load the address of the buffer we use for passing arguments and + * receiving results: + */ + x86_lea( + func, + ecx, + arg[i] ); - x86_lea( - func, - ecx, - get_temp( TEMP_R0, 0 ) ); - - x86_push( func, ecx ); + /* Push actual function arguments (currently just the pointer to + * the buffer above), and call the function: + */ + x86_push( func, ecx ); + } + x86_mov_reg_imm( func, ecx, (unsigned long) code ); x86_call( func, ecx ); - x86_pop(func, ecx ); - + + /* Pop the arguments (or just add an immediate to esp) + */ + for (i = 0; i < nr_args; i++) { + x86_pop(func, ecx ); + } + + /* Pop the saved XMM regs: + */ for(i = 0, n = 0; i < 8; ++i) - if(xmm_mask & (1 << i)) { + if(xmm_save_mask & (1 << i)) { sse_movups( func, - make_xmm( xmm ), + make_xmm( i ), x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) ); ++n; } @@ -600,33 +641,88 @@ emit_func_call_dst( x86_pop( func, x86_make_reg( file_REG32, reg_AX) ); +} + +static void +emit_func_call_dst_src1( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst, + unsigned xmm_src0, + void (PIPE_CDECL *code)() ) +{ + struct x86_reg store = get_temp( TEMP_R0, 0 ); + unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst); + + /* Store our input parameters (in xmm regs) to the buffer we use + * for passing arguments. We will pass a pointer to this buffer as + * the actual function argument. + */ + sse_movaps( + func, + store, + make_xmm( xmm_src0 ) ); + + emit_func_call( func, + xmm_mask, + &store, + 1, + code ); sse_movaps( func, make_xmm( xmm_dst ), - get_temp( TEMP_R0, 0 ) ); + store ); } + static void -emit_func_call_dst_src( +emit_func_call_dst_src2( struct x86_function *func, unsigned xmm_save, unsigned xmm_dst, - unsigned xmm_src, + unsigned xmm_src0, + unsigned xmm_src1, void (PIPE_CDECL *code)() ) { + struct x86_reg store = get_temp( TEMP_R0, 0 ); + unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst); + + /* Store two inputs to parameter buffer. + */ sse_movaps( func, - get_temp( TEMP_R0, 1 ), - make_xmm( xmm_src ) ); + store, + make_xmm( xmm_src0 ) ); - emit_func_call_dst( + sse_movaps( func, - xmm_save, - xmm_dst, - code ); + x86_make_disp( store, 4 * sizeof(float) ), + make_xmm( xmm_src1 ) ); + + + /* Emit the call + */ + emit_func_call( func, + xmm_mask, + &store, + 1, + code ); + + /* Retrieve the results: + */ + sse_movaps( + func, + make_xmm( xmm_dst ), + store ); } + + + + +#if defined(PIPE_ARCH_SSE) + /* * Fast SSE2 implementation of special math functions. */ @@ -678,6 +774,7 @@ exp2f4(__m128 x) return _mm_mul_ps(expipart, expfpart); } + /** * See http://www.devmaster.net/forums/showthread.php?p=43580 */ @@ -720,12 +817,16 @@ log2f4(__m128 x) return _mm_add_ps(logmant, exp); } + static INLINE __m128 powf4(__m128 x, __m128 y) { return exp2f4(_mm_mul_ps(log2f4(x), y)); } +#endif /* PIPE_ARCH_SSE */ + + /** * Low-level instruction translators. @@ -772,21 +873,29 @@ emit_cos( unsigned xmm_save, unsigned xmm_dst ) { - emit_func_call_dst( + emit_func_call_dst_src1( func, xmm_save, xmm_dst, + xmm_dst, cos4f ); } static void PIPE_CDECL -#if defined(PIPE_CC_GCC) +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) __attribute__((force_align_arg_pointer)) #endif ex24f( float *store ) { +#if defined(PIPE_ARCH_SSE) _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); +#else + store[0] = util_fast_exp2( store[0] ); + store[1] = util_fast_exp2( store[1] ); + store[2] = util_fast_exp2( store[2] ); + store[3] = util_fast_exp2( store[3] ); +#endif } static void @@ -795,10 +904,11 @@ emit_ex2( unsigned xmm_save, unsigned xmm_dst ) { - emit_func_call_dst( + emit_func_call_dst_src1( func, xmm_save, xmm_dst, + xmm_dst, ex24f ); } @@ -840,10 +950,11 @@ emit_flr( unsigned xmm_save, unsigned xmm_dst ) { - emit_func_call_dst( + emit_func_call_dst_src1( func, xmm_save, xmm_dst, + xmm_dst, flr4f ); } @@ -863,21 +974,29 @@ emit_frc( unsigned xmm_save, unsigned xmm_dst ) { - emit_func_call_dst( + emit_func_call_dst_src1( func, xmm_save, xmm_dst, + xmm_dst, frc4f ); } static void PIPE_CDECL -#if defined(PIPE_CC_GCC) +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) __attribute__((force_align_arg_pointer)) #endif lg24f( float *store ) { +#if defined(PIPE_ARCH_SSE) _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); +#else + store[0] = util_fast_log2( store[0] ); + store[1] = util_fast_log2( store[1] ); + store[2] = util_fast_log2( store[2] ); + store[3] = util_fast_log2( store[3] ); +#endif } static void @@ -886,10 +1005,11 @@ emit_lg2( unsigned xmm_save, unsigned xmm_dst ) { - emit_func_call_dst( + emit_func_call_dst_src1( func, xmm_save, xmm_dst, + xmm_dst, lg24f ); } @@ -930,19 +1050,19 @@ emit_neg( } static void PIPE_CDECL -#if defined(PIPE_CC_GCC) +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) __attribute__((force_align_arg_pointer)) #endif pow4f( float *store ) { -#if 1 +#if defined(PIPE_ARCH_SSE) _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); #else - store[0] = powf( store[0], store[4] ); - store[1] = powf( store[1], store[5] ); - store[2] = powf( store[2], store[6] ); - store[3] = powf( store[3], store[7] ); + store[0] = util_fast_pow( store[0], store[4] ); + store[1] = util_fast_pow( store[1], store[5] ); + store[2] = util_fast_pow( store[2], store[6] ); + store[3] = util_fast_pow( store[3], store[7] ); #endif } @@ -951,13 +1071,15 @@ emit_pow( struct x86_function *func, unsigned xmm_save, unsigned xmm_dst, - unsigned xmm_src ) + unsigned xmm_src0, + unsigned xmm_src1 ) { - emit_func_call_dst_src( + emit_func_call_dst_src2( func, xmm_save, xmm_dst, - xmm_src, + xmm_src0, + xmm_src1, pow4f ); } @@ -977,6 +1099,30 @@ emit_rcp ( make_xmm( xmm_src ) ); } +static void PIPE_CDECL +rnd4f( + float *store ) +{ + store[0] = floorf( store[0] + 0.5f ); + store[1] = floorf( store[1] + 0.5f ); + store[2] = floorf( store[2] + 0.5f ); + store[3] = floorf( store[3] + 0.5f ); +} + +static void +emit_rnd( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst_src1( + func, + xmm_save, + xmm_dst, + xmm_dst, + rnd4f ); +} + static void emit_rsqrt( struct x86_function *func, @@ -1036,6 +1182,30 @@ emit_setsign( TGSI_EXEC_TEMP_80000000_C ) ); } +static void PIPE_CDECL +sgn4f( + float *store ) +{ + store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f; + store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f; + store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f; + store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f; +} + +static void +emit_sgn( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst_src1( + func, + xmm_save, + xmm_dst, + xmm_dst, + sgn4f ); +} + static void PIPE_CDECL sin4f( float *store ) @@ -1051,10 +1221,11 @@ emit_sin (struct x86_function *func, unsigned xmm_save, unsigned xmm_dst) { - emit_func_call_dst( + emit_func_call_dst_src1( func, xmm_save, xmm_dst, + xmm_dst, sin4f ); } @@ -1070,6 +1241,12 @@ emit_sub( make_xmm( xmm_src ) ); } + + + + + + /** * Register fetch. */ @@ -1228,20 +1405,164 @@ emit_store( #define STORE( FUNC, INST, XMM, INDEX, CHAN )\ emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN ) + +static void PIPE_CDECL +fetch_texel( struct tgsi_sampler **sampler, + float *store ) +{ +#if 0 + uint j; + + debug_printf("%s sampler: %p (%p) store: %p\n", + __FUNCTION__, + sampler, *sampler, + store ); + + debug_printf("lodbias %f\n", store[12]); + + for (j = 0; j < 4; j++) + debug_printf("sample %d texcoord %f %f\n", + j, + store[0+j], + store[4+j]); +#endif + + { + float rgba[NUM_CHANNELS][QUAD_SIZE]; + (*sampler)->get_samples(*sampler, + &store[0], + &store[4], + &store[8], + 0.0f, /*store[12], lodbias */ + rgba); + + memcpy( store, rgba, 16 * sizeof(float)); + } + +#if 0 + for (j = 0; j < 4; j++) + debug_printf("sample %d result %f %f %f %f\n", + j, + store[0+j], + store[4+j], + store[8+j], + store[12+j]); +#endif +} + /** * High-level instruction translators. */ +static void +emit_tex( struct x86_function *func, + const struct tgsi_full_instruction *inst, + boolean lodbias, + boolean projected) +{ + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + struct x86_reg args[2]; + unsigned count; + unsigned i; + + switch (inst->InstructionExtTexture.Texture) { + case TGSI_TEXTURE_1D: + case TGSI_TEXTURE_SHADOW1D: + count = 1; + break; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + count = 2; + break; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + count = 3; + break; + default: + assert(0); + return; + } + + if (lodbias) { + FETCH( func, *inst, 3, 0, 3 ); + } + else { + emit_tempf( + func, + 3, + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ); + + } + + /* store lodbias whether enabled or not -- fetch_texel currently + * respects it always. + */ + sse_movaps( func, + get_temp( TEMP_R0, 3 ), + make_xmm( 3 ) ); + + + if (projected) { + FETCH( func, *inst, 3, 0, 3 ); + + emit_rcp( func, 3, 3 ); + } + + for (i = 0; i < count; i++) { + FETCH( func, *inst, i, 0, i ); + + if (projected) { + sse_mulps( + func, + make_xmm( i ), + make_xmm( 3 ) ); + } + + /* Store in the argument buffer: + */ + sse_movaps( + func, + get_temp( TEMP_R0, i ), + make_xmm( i ) ); + } + + args[0] = get_temp( TEMP_R0, 0 ); + args[1] = get_sampler_ptr( unit ); + + + emit_func_call( func, + 0, + args, + Elements(args), + fetch_texel ); + + /* If all four channels are enabled, could use a pointer to + * dst[0].x instead of TEMP_R0 for store? + */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) { + + sse_movaps( + func, + make_xmm( 0 ), + get_temp( TEMP_R0, i ) ); + + STORE( func, *inst, 0, 0, i ); + } +} + + static void emit_kil( struct x86_function *func, const struct tgsi_full_src_register *reg ) { unsigned uniquemask; - unsigned registers[4]; - unsigned nextregister = 0; - unsigned firstchan = ~0; + unsigned unique_count = 0; unsigned chan_index; + unsigned i; /* This mask stores component bits that were already tested. Note that * we test if the value is less than zero, so 1.0 and 0.0 need not to be @@ -1261,18 +1582,11 @@ emit_kil( uniquemask |= 1 << swizzle; /* allocate register */ - registers[chan_index] = nextregister; emit_fetch( func, - nextregister, + unique_count++, reg, chan_index ); - nextregister++; - - /* mark the first channel used */ - if( firstchan == ~0 ) { - firstchan = chan_index; - } } } @@ -1283,32 +1597,32 @@ emit_kil( func, x86_make_reg( file_REG32, reg_DX ) ); - FOR_EACH_CHANNEL( chan_index ) { - if( uniquemask & (1 << chan_index) ) { - sse_cmpps( + for (i = 0 ; i < unique_count; i++ ) { + struct x86_reg dataXMM = make_xmm(i); + + sse_cmpps( + func, + dataXMM, + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ), + cc_LessThan ); + + if( i == 0 ) { + sse_movmskps( func, - make_xmm( registers[chan_index] ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ), - cc_LessThan ); - - if( chan_index == firstchan ) { - sse_pmovmskb( - func, - x86_make_reg( file_REG32, reg_AX ), - make_xmm( registers[chan_index] ) ); - } - else { - sse_pmovmskb( - func, - x86_make_reg( file_REG32, reg_DX ), - make_xmm( registers[chan_index] ) ); - x86_or( - func, - x86_make_reg( file_REG32, reg_AX ), - x86_make_reg( file_REG32, reg_DX ) ); - } + x86_make_reg( file_REG32, reg_AX ), + dataXMM ); + } + else { + sse_movmskps( + func, + x86_make_reg( file_REG32, reg_DX ), + dataXMM ); + x86_or( + func, + x86_make_reg( file_REG32, reg_AX ), + x86_make_reg( file_REG32, reg_DX ) ); } } @@ -1396,6 +1710,31 @@ emit_cmp( } } + +/** + * Check if inst src/dest regs use indirect addressing into temporary + * register file. + */ +static boolean +indirect_temp_reference(const struct tgsi_full_instruction *inst) +{ + uint i; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i]; + if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY && + reg->SrcRegister.Indirect) + return TRUE; + } + for (i = 0; i < inst->Instruction.NumDstRegs; i++) { + const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i]; + if (reg->DstRegister.File == TGSI_FILE_TEMPORARY && + reg->DstRegister.Indirect) + return TRUE; + } + return FALSE; +} + + static int emit_instruction( struct x86_function *func, @@ -1403,10 +1742,15 @@ emit_instruction( { unsigned chan_index; + /* we can't handle indirect addressing into temp register file yet */ + if (indirect_temp_reference(inst)) + return FALSE; + switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ARL: FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); + emit_flr(func, 0, 0); emit_f2it( func, 0 ); STORE( func, *inst, 0, 0, chan_index ); } @@ -1473,7 +1817,7 @@ emit_instruction( get_temp( TGSI_EXEC_TEMP_MINUS_128_I, TGSI_EXEC_TEMP_MINUS_128_C ) ); - emit_pow( func, 3, 1, 2 ); + emit_pow( func, 3, 1, 1, 2 ); FETCH( func, *inst, 0, 0, CHAN_X ); sse_xorps( func, @@ -1483,7 +1827,7 @@ emit_instruction( func, make_xmm( 2 ), make_xmm( 0 ), - cc_LessThanEqual ); + cc_LessThan ); sse_andps( func, make_xmm( 2 ), @@ -1505,6 +1849,7 @@ emit_instruction( case TGSI_OPCODE_RSQ: /* TGSI_OPCODE_RECIPSQRT */ FETCH( func, *inst, 0, 0, CHAN_X ); + emit_abs( func, 0 ); emit_rsqrt( func, 1, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 1, 0, chan_index ); @@ -1742,7 +2087,18 @@ emit_instruction( case TGSI_OPCODE_DOT2ADD: /* TGSI_OPCODE_DP2A */ - return 0; + FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ + FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ + emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ + FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ + FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ + emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ + } break; case TGSI_OPCODE_INDEX: @@ -1776,7 +2132,11 @@ emit_instruction( break; case TGSI_OPCODE_ROUND: - return 0; + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_rnd( func, 0, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } break; case TGSI_OPCODE_EXPBASE2: @@ -1801,7 +2161,7 @@ emit_instruction( /* TGSI_OPCODE_POW */ FETCH( func, *inst, 0, 0, CHAN_X ); FETCH( func, *inst, 1, 1, CHAN_X ); - emit_pow( func, 0, 0, 1 ); + emit_pow( func, 0, 0, 0, 1 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1970,21 +2330,7 @@ emit_instruction( break; case TGSI_OPCODE_TEX: - if (0) { - /* Disable dummy texture code: - */ - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - } - else { - return 0; - } + emit_tex( func, inst, FALSE, FALSE ); break; case TGSI_OPCODE_TXD: @@ -2016,7 +2362,12 @@ emit_instruction( break; case TGSI_OPCODE_ARR: - return 0; + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_rnd( func, 0, 0 ); + emit_f2it( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } break; case TGSI_OPCODE_BRA: @@ -2035,7 +2386,12 @@ emit_instruction( break; case TGSI_OPCODE_SSG: - return 0; + /* TGSI_OPCODE_SGN */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_sgn( func, 0, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } break; case TGSI_OPCODE_CMP: @@ -2072,11 +2428,94 @@ emit_instruction( break; case TGSI_OPCODE_TXB: - return 0; + emit_tex( func, inst, TRUE, FALSE ); break; case TGSI_OPCODE_NRM: - return 0; + /* fall-through */ + case TGSI_OPCODE_NRM4: + /* 3 or 4-component normalization */ + { + uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4; + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) || + (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) { + + /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */ + + /* xmm4 = src.x */ + /* xmm0 = src.x * src.x */ + FETCH(func, *inst, 0, 0, CHAN_X); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + emit_MOV(func, 4, 0); + } + emit_mul(func, 0, 0); + + /* xmm5 = src.y */ + /* xmm0 = xmm0 + src.y * src.y */ + FETCH(func, *inst, 1, 0, CHAN_Y); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + emit_MOV(func, 5, 1); + } + emit_mul(func, 1, 1); + emit_add(func, 0, 1); + + /* xmm6 = src.z */ + /* xmm0 = xmm0 + src.z * src.z */ + FETCH(func, *inst, 1, 0, CHAN_Z); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + emit_MOV(func, 6, 1); + } + emit_mul(func, 1, 1); + emit_add(func, 0, 1); + + if (dims == 4) { + /* xmm7 = src.w */ + /* xmm0 = xmm0 + src.w * src.w */ + FETCH(func, *inst, 1, 0, CHAN_W); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + emit_MOV(func, 7, 1); + } + emit_mul(func, 1, 1); + emit_add(func, 0, 1); + } + + /* xmm1 = 1 / sqrt(xmm0) */ + emit_rsqrt(func, 1, 0); + + /* dst.x = xmm1 * src.x */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + emit_mul(func, 4, 1); + STORE(func, *inst, 4, 0, CHAN_X); + } + + /* dst.y = xmm1 * src.y */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + emit_mul(func, 5, 1); + STORE(func, *inst, 5, 0, CHAN_Y); + } + + /* dst.z = xmm1 * src.z */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + emit_mul(func, 6, 1); + STORE(func, *inst, 6, 0, CHAN_Z); + } + + /* dst.w = xmm1 * src.w */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) { + emit_mul(func, 7, 1); + STORE(func, *inst, 7, 0, CHAN_W); + } + } + + /* dst0.w = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) { + emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C); + STORE(func, *inst, 0, 0, CHAN_W); + } + } break; case TGSI_OPCODE_DIV: @@ -2084,13 +2523,26 @@ emit_instruction( break; case TGSI_OPCODE_DP2: - return 0; + FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ + FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ + emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ + FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ + FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ + emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ + } break; case TGSI_OPCODE_TXL: - return 0; + emit_tex( func, inst, TRUE, FALSE ); break; + case TGSI_OPCODE_TXP: + emit_tex( func, inst, FALSE, TRUE ); + break; + case TGSI_OPCODE_BRK: return 0; break; @@ -2270,7 +2722,7 @@ emit_declaration( static void aos_to_soa( struct x86_function *func, uint arg_aos, - uint arg_soa, + uint arg_machine, uint arg_num, uint arg_stride ) { @@ -2285,7 +2737,10 @@ static void aos_to_soa( struct x86_function *func, x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) ); - x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) ); + x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) ); + x86_lea( func, soa_input, + x86_make_disp( soa_input, + Offset(struct tgsi_exec_machine, Inputs) ) ); x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) ); x86_mov( func, stride, x86_fn_arg( func, arg_stride ) ); @@ -2327,28 +2782,30 @@ static void aos_to_soa( struct x86_function *func, x86_jcc( func, cc_NE, inner_loop ); /* Restore EBX */ - x86_pop( func, aos_input ); + x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); } -static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride ) +static void soa_to_aos( struct x86_function *func, + uint arg_aos, + uint arg_machine, + uint arg_num, + uint arg_stride ) { - struct x86_reg soa_output; - struct x86_reg aos_output; - struct x86_reg num_outputs; - struct x86_reg temp; + struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX ); + struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX ); + struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX ); + struct x86_reg temp = x86_make_reg( file_REG32, reg_DX ); int inner_loop; - soa_output = x86_make_reg( file_REG32, reg_AX ); - aos_output = x86_make_reg( file_REG32, reg_BX ); - num_outputs = x86_make_reg( file_REG32, reg_CX ); - temp = x86_make_reg( file_REG32, reg_DX ); - /* Save EBX */ - x86_push( func, aos_output ); + x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - x86_mov( func, soa_output, x86_fn_arg( func, soa ) ); - x86_mov( func, aos_output, x86_fn_arg( func, aos ) ); - x86_mov( func, num_outputs, x86_fn_arg( func, num ) ); + x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) ); + x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) ); + x86_lea( func, soa_output, + x86_make_disp( soa_output, + Offset(struct tgsi_exec_machine, Outputs) ) ); + x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) ); /* do */ inner_loop = x86_get_label( func ); @@ -2365,7 +2822,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) ); sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) ); - x86_mov( func, temp, x86_fn_arg( func, stride ) ); + x86_mov( func, temp, x86_fn_arg( func, arg_stride ) ); x86_push( func, aos_output ); sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); @@ -2389,20 +2846,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, x86_jcc( func, cc_NE, inner_loop ); /* Restore EBX */ - x86_pop( func, aos_output ); + x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); } /** * Translate a TGSI vertex/fragment shader to SSE2 code. * Slightly different things are done for vertex vs. fragment shaders. * - * Note that fragment shaders are responsible for interpolating shader - * inputs. Because on x86 we have only 4 GP registers, and here we - * have 5 shader arguments (input, output, const, temp and coef), the - * code is split into two phases -- DECLARATION and INSTRUCTION phase. - * GP register holding the output argument is aliased with the coeff - * argument, as outputs are not needed in the DECLARATION phase. - * * \param tokens the TGSI input shader * \param func the output SSE code/function * \param immediates buffer to place immediates, later passed to SSE func @@ -2416,7 +2866,6 @@ tgsi_emit_sse2( boolean do_swizzles ) { struct tgsi_parse_context parse; - boolean instruction_phase = FALSE; unsigned ok = 1; uint num_immediates = 0; @@ -2428,74 +2877,48 @@ tgsi_emit_sse2( /* Can't just use EDI, EBX without save/restoring them: */ - x86_push( - func, - get_immediate_base() ); - - x86_push( - func, - get_temp_base() ); - + x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); + x86_push( func, x86_make_reg( file_REG32, reg_DI ) ); /* * Different function args for vertex/fragment shaders: */ - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - /* DECLARATION phase, do not load output argument. */ - x86_mov( - func, - get_input_base(), - x86_fn_arg( func, 1 ) ); - /* skipping outputs argument here */ - x86_mov( - func, - get_const_base(), - x86_fn_arg( func, 3 ) ); - x86_mov( - func, - get_temp_base(), - x86_fn_arg( func, 4 ) ); - x86_mov( - func, - get_coef_base(), - x86_fn_arg( func, 5 ) ); - x86_mov( - func, - get_immediate_base(), - x86_fn_arg( func, 6 ) ); - } - else { - assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX); - + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { if (do_swizzles) aos_to_soa( func, - 6, /* aos_input */ - 1, /* machine->input */ - 7, /* num_inputs */ - 8 ); /* input_stride */ + 4, /* aos_input */ + 1, /* machine */ + 5, /* num_inputs */ + 6 ); /* input_stride */ + } + x86_mov( + func, + get_machine_base(), + x86_fn_arg( func, 1 ) ); + x86_mov( + func, + get_const_base(), + x86_fn_arg( func, 2 ) ); + x86_mov( + func, + get_immediate_base(), + x86_fn_arg( func, 3 ) ); + + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { x86_mov( - func, - get_input_base(), - x86_fn_arg( func, 1 ) ); - x86_mov( - func, - get_output_base(), - x86_fn_arg( func, 2 ) ); - x86_mov( - func, - get_const_base(), - x86_fn_arg( func, 3 ) ); - x86_mov( - func, - get_temp_base(), - x86_fn_arg( func, 4 ) ); + func, + get_coef_base(), + x86_fn_arg( func, 4 ) ); + x86_mov( - func, - get_immediate_base(), - x86_fn_arg( func, 5 ) ); + func, + get_sampler_base(), + x86_make_disp( get_machine_base(), + Offset( struct tgsi_exec_machine, Samplers ) ) ); } + while( !tgsi_parse_end_of_tokens( &parse ) && ok ) { tgsi_parse_token( &parse ); @@ -2509,17 +2932,6 @@ tgsi_emit_sse2( break; case TGSI_TOKEN_TYPE_INSTRUCTION: - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - if( !instruction_phase ) { - /* INSTRUCTION phase, overwrite coeff with output. */ - instruction_phase = TRUE; - x86_mov( - func, - get_output_base(), - x86_fn_arg( func, 2 ) ); - } - } - ok = emit_instruction( func, &parse.FullToken.FullInstruction ); @@ -2535,7 +2947,7 @@ tgsi_emit_sse2( case TGSI_TOKEN_TYPE_IMMEDIATE: /* simply copy the immediate values into the next immediates[] slot */ { - const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; + const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1; uint i; assert(size <= 4); assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); @@ -2563,18 +2975,17 @@ tgsi_emit_sse2( if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { if (do_swizzles) - soa_to_aos( func, 9, 2, 10, 11 ); + soa_to_aos( func, + 7, /* aos_output */ + 1, /* machine */ + 8, /* num_outputs */ + 9 ); /* output_stride */ } /* Can't just use EBX, EDI without save/restoring them: */ - x86_pop( - func, - get_temp_base() ); - - x86_pop( - func, - get_immediate_base() ); + x86_pop( func, x86_make_reg( file_REG32, reg_DI ) ); + x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); emit_ret( func );