#include "tgsi/util/tgsi_parse.h"
#define SSE_MAX_VERTICES 4
+#define SSE_SWIZZLES 1 /* Passed to tgsi_emit_sse2() as do_swizzles; also selects which codegen_function signature is used. */
+#if SSE_SWIZZLES
+typedef void (XSTDCALL *codegen_function) ( /* Compiled-shader entry point that also receives the AoS vertex buffers. */
+ const struct tgsi_exec_vector *input, /* arg 0: SoA inputs (machine->Inputs at the call site) */
+ struct tgsi_exec_vector *output, /* arg 1: SoA outputs (machine->Outputs at the call site) */
+ float (*constant)[4], /* arg 2: constants, four floats each */
+ struct tgsi_exec_vector *temporary, /* arg 3: temporaries (machine->Temps at the call site) */
+ float (*immediates)[4], /* arg 4: immediates, four floats each */
+ const float (*aos_input)[4], /* arg 5: AoS vertex input, four floats per attribute */
+ uint num_inputs, /* arg 6: number of input attributes */
+ uint input_stride, /* arg 7: byte distance between consecutive input vertices */
+ float (*aos_output)[4], /* arg 8: AoS vertex output, four floats per attribute */
+ uint num_outputs, /* arg 9: number of output attributes */
+ uint output_stride ); /* arg 10: byte distance between consecutive output vertices */
+#else /* !SSE_SWIZZLES: the caller converts between AoS and SoA itself */
typedef void (XSTDCALL *codegen_function) (
const struct tgsi_exec_vector *input,
struct tgsi_exec_vector *output,
float (*constant)[4],
struct tgsi_exec_vector *temporary,
float (*immediates)[4] );
-
+#endif /* SSE_SWIZZLES */
struct draw_sse_vertex_shader {
struct draw_vertex_shader base;
{
struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
struct tgsi_exec_machine *machine = shader->machine;
- unsigned int i, j;
- unsigned slot;
+ unsigned int i;
for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
+#if SSE_SWIZZLES
+ /* run compiled shader
+ */
+ shader->func(machine->Inputs,
+ machine->Outputs,
+ (float (*)[4])constants,
+ machine->Temps,
+ shader->immediates,
+ input,
+ base->info.num_inputs,
+ input_stride,
+ output,
+ base->info.num_outputs,
+ output_stride );
+
+ input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+ output = (float (*)[4])((char *)output + output_stride * max_vertices);
+#else
+ unsigned int j, slot;
+
/* Swizzle inputs.
*/
for (j = 0; j < max_vertices; j++) {
machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
- }
+ }
input = (const float (*)[4])((const char *)input + input_stride);
- }
+ }
/* run compiled shader
*/
machine->Temps,
shader->immediates);
-
/* Unswizzle all output results.
*/
for (j = 0; j < max_vertices; j++) {
output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
- }
+ }
output = (float (*)[4])((char *)output + output_stride);
- }
+ }
+#endif
}
}
x86_init_func( &vs->sse2_program );
if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
- &vs->sse2_program, vs->immediates ))
+ &vs->sse2_program, vs->immediates, SSE_SWIZZLES ))
goto fail;
vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
break;
case TGSI_OPCODE_RET:
- case TGSI_OPCODE_END:
#ifdef WIN32
emit_retw( func, 16 );
#else
#endif
break;
+ case TGSI_OPCODE_END:
+ break;
+
case TGSI_OPCODE_SSG:
return 0;
break;
}
}
+static void aos_to_soa( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{ /* Emits, into func, an SSE loop that transposes vertex attributes from
+   * AoS layout (four floats per attribute, per vertex) to SoA layout (one
+   * four-wide vector per component).
+   *
+   * aos, soa, num and stride are not values but argument indices of the
+   * generated function; each is fetched at run time via get_argument():
+   *   aos    - pointer to the first vertex's attribute data (source)
+   *   soa    - pointer to the SoA destination
+   *   num    - number of attributes to convert (loop count)
+   *   stride - byte distance between consecutive source vertices
+   *
+   * Each loop iteration converts one attribute of four consecutive
+   * vertices, so the emitted code always reads four vertices.
+   *
+   * NOTE(review): the count is only tested after the body and the
+   * decrement, so a count of 0 would wrap rather than skip the loop -
+   * confirm callers never pass 0 attributes.
+   */
+ struct x86_reg soa_input;
+ struct x86_reg aos_input;
+ struct x86_reg num_inputs;
+ struct x86_reg temp;
+ unsigned char *inner_loop;
+
+ soa_input = x86_make_reg( file_REG32, reg_AX ); /* EAX: SoA write cursor */
+ aos_input = x86_make_reg( file_REG32, reg_BX ); /* EBX: AoS read cursor */
+ num_inputs = x86_make_reg( file_REG32, reg_CX ); /* ECX: attributes remaining */
+ temp = x86_make_reg( file_REG32, reg_DX ); /* EDX: scratch (stride, then pointer increments) */
+
+ /* Save EBX; it is used as the AoS cursor and restored at the end. */
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+ x86_mov( func, soa_input, get_argument( soa + 1 ) ); /* NOTE(review): the +1 presumably skips the EBX slot just pushed - get_argument() is not visible here, confirm */
+ x86_mov( func, aos_input, get_argument( aos + 1 ) );
+ x86_mov( func, num_inputs, get_argument( num + 1 ) );
+
+ inner_loop = x86_get_label( func ); /* loop top: one attribute per iteration */
+
+ x86_mov( func, temp, get_argument( stride + 1 ) ); /* reloaded every iteration because temp is reused below; must precede the push that follows */
+ x86_push( func, aos_input ); /* remember vertex 0's address while stepping through the four vertices */
+ sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); /* xmm0.lo = vertex 0 {x,y} */
+ sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); /* xmm3.lo = vertex 0 {z,w} */
+ x86_add( func, aos_input, temp );
+ sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); /* xmm0.hi = vertex 1 {x,y} */
+ sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); /* xmm3.hi = vertex 1 {z,w} */
+ x86_add( func, aos_input, temp );
+ sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); /* xmm1.lo = vertex 2 {x,y} */
+ sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); /* xmm4.lo = vertex 2 {z,w} */
+ x86_add( func, aos_input, temp );
+ sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); /* xmm1.hi = vertex 3 {x,y} */
+ sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); /* xmm4.hi = vertex 3 {z,w} */
+ x86_pop( func, aos_input ); /* back to vertex 0 */
+
+ sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); /* copies, since shufps overwrites its destination */
+ sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+ sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 ); /* even elements: xmm0 = x of vertices 0..3 */
+ sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd ); /* odd elements:  xmm2 = y of vertices 0..3 */
+ sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 ); /* xmm3 = z of vertices 0..3 */
+ sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd ); /* xmm5 = w of vertices 0..3 */
+
+ sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) ); /* x vector */
+ sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) ); /* y vector */
+ sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) ); /* z vector */
+ sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) ); /* w vector */
+
+ /* Advance to next input: 16 bytes (four floats) on the AoS side, 64 bytes (four 16-byte vectors) on the SoA side. */
+ x86_mov_reg_imm( func, temp, 16 );
+ x86_add( func, aos_input, temp );
+ x86_mov_reg_imm( func, temp, 64 );
+ x86_add( func, soa_input, temp );
+ x86_dec( func, num_inputs );
+ x86_jcc( func, cc_NE, inner_loop ); /* loop while the decremented count is non-zero */
+
+ /* Restore EBX saved at the top of the emitted sequence. */
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
+}
+
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{ /* Emits, into func, an SSE loop that transposes shader results from SoA
+   * layout (one four-wide vector per component) back to AoS layout (four
+   * floats per attribute, per vertex).
+   *
+   * aos, soa, num and stride are not values but argument indices of the
+   * generated function; each is fetched at run time via get_argument():
+   *   aos    - pointer to the first vertex's attribute data (destination)
+   *   soa    - pointer to the SoA source
+   *   num    - number of attributes to convert (loop count)
+   *   stride - byte distance between consecutive destination vertices
+   *
+   * Each loop iteration converts one attribute of four consecutive
+   * vertices, so the emitted code always writes four vertices.
+   *
+   * NOTE(review): the count is only tested after the body and the
+   * decrement, so a count of 0 would wrap rather than skip the loop -
+   * confirm callers never pass 0 attributes.
+   */
+ struct x86_reg soa_output;
+ struct x86_reg aos_output;
+ struct x86_reg num_outputs;
+ struct x86_reg temp;
+ unsigned char *inner_loop;
+
+ soa_output = x86_make_reg( file_REG32, reg_AX ); /* EAX: SoA read cursor */
+ aos_output = x86_make_reg( file_REG32, reg_BX ); /* EBX: AoS write cursor */
+ num_outputs = x86_make_reg( file_REG32, reg_CX ); /* ECX: attributes remaining */
+ temp = x86_make_reg( file_REG32, reg_DX ); /* EDX: scratch (stride, then pointer increments) */
+
+ /* Save EBX; it is used as the AoS cursor and restored at the end. */
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+ x86_mov( func, soa_output, get_argument( soa + 1 ) ); /* NOTE(review): the +1 presumably skips the EBX slot just pushed - get_argument() is not visible here, confirm */
+ x86_mov( func, aos_output, get_argument( aos + 1 ) );
+ x86_mov( func, num_outputs, get_argument( num + 1 ) );
+
+ inner_loop = x86_get_label( func ); /* loop top: one attribute per iteration */
+
+ sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) ); /* xmm0 = x of vertices 0..3 */
+ sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) ); /* xmm1 = y of vertices 0..3 */
+ sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) ); /* xmm3 = z of vertices 0..3 */
+ sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) ); /* xmm4 = w of vertices 0..3 */
+
+ sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); /* copies, since the unpacks overwrite their destination */
+ sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+ sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) ); /* xmm0 = {x0,y0,x1,y1} */
+ sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) ); /* xmm2 = {x2,y2,x3,y3} */
+ sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) ); /* xmm3 = {z0,w0,z1,w1} */
+ sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) ); /* xmm5 = {z2,w2,z3,w3} */
+
+ x86_mov( func, temp, get_argument( stride + 1 ) ); /* reloaded every iteration because temp is reused below; must precede the push that follows */
+ x86_push( func, aos_output ); /* remember vertex 0's address while stepping through the four vertices */
+ sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); /* vertex 0 {x,y} */
+ sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); /* vertex 0 {z,w} */
+ x86_add( func, aos_output, temp );
+ sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); /* vertex 1 {x,y} */
+ sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); /* vertex 1 {z,w} */
+ x86_add( func, aos_output, temp );
+ sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); /* vertex 2 {x,y} */
+ sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); /* vertex 2 {z,w} */
+ x86_add( func, aos_output, temp );
+ sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); /* vertex 3 {x,y} */
+ sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); /* vertex 3 {z,w} */
+ x86_pop( func, aos_output ); /* back to vertex 0 */
+
+ /* Advance to next output: 16 bytes (four floats) on the AoS side, 64 bytes (four 16-byte vectors) on the SoA side. */
+ x86_mov_reg_imm( func, temp, 16 );
+ x86_add( func, aos_output, temp );
+ x86_mov_reg_imm( func, temp, 64 );
+ x86_add( func, soa_output, temp );
+ x86_dec( func, num_outputs );
+ x86_jcc( func, cc_NE, inner_loop ); /* loop while the decremented count is non-zero */
+
+ /* Restore EBX saved at the top of the emitted sequence. */
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
+}
/**
* Translate a TGSI vertex/fragment shader to SSE2 code.
tgsi_emit_sse2(
const struct tgsi_token *tokens,
struct x86_function *func,
- float (*immediates)[4])
+ float (*immediates)[4],
+ boolean do_swizzles )
{
struct tgsi_parse_context parse;
boolean instruction_phase = FALSE;
else {
assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
+ if (do_swizzles)
+ aos_to_soa( func, 5, 0, 6, 7 );
+
x86_mov(
func,
get_input_base(),
}
}
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+ if (do_swizzles)
+ soa_to_aos( func, 8, 1, 9, 10 );
+ }
+
+#ifdef WIN32
+ emit_retw( func, 16 );
+#else
+ emit_ret( func );
+#endif
+
tgsi_parse_free( &parse );
return ok;