From 0d13ade0cdd38759936a74824efbd6ac8b563aed Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 2 Oct 2007 11:46:11 -0600 Subject: [PATCH] Move tgsi machine state init/allocations so they're done less frequently. This, plus expanding all instructions ahead of time, seems to have improved the performance of program execution by 8x or so. --- src/mesa/pipe/draw/draw_private.h | 4 + src/mesa/pipe/draw/draw_vertex_shader.c | 52 ++++--- src/mesa/pipe/softpipe/sp_quad_fs.c | 81 ++++++----- src/mesa/pipe/tgsi/exec/tgsi_exec.c | 176 ++++++++++++++---------- src/mesa/pipe/tgsi/exec/tgsi_exec.h | 8 +- 5 files changed, 179 insertions(+), 142 deletions(-) diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h index 12a970a6711..a54fef41e74 100644 --- a/src/mesa/pipe/draw/draw_private.h +++ b/src/mesa/pipe/draw/draw_private.h @@ -47,6 +47,8 @@ #include "draw_vertex.h" #include "x86/rtasm/x86sse.h" +#include "pipe/tgsi/exec/tgsi_core.h" + /** * Basic vertex info. @@ -187,6 +189,8 @@ struct draw_context unsigned prim; /**< current prim type: PIPE_PRIM_x */ unsigned reduced_prim; + /** TGSI program interpreter runtime state */ + struct tgsi_exec_machine machine; /* Post-tnl vertex cache: */ diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c index 3518bd52a3f..e3bcd353341 100644 --- a/src/mesa/pipe/draw/draw_vertex_shader.c +++ b/src/mesa/pipe/draw/draw_vertex_shader.c @@ -86,7 +86,7 @@ run_vertex_program(struct draw_context *draw, unsigned elts[4], unsigned count, struct vertex_header *vOut[]) { - struct tgsi_exec_machine machine; + struct tgsi_exec_machine *machine = &draw->machine; unsigned int j; ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX); @@ -98,35 +98,39 @@ run_vertex_program(struct draw_context *draw, assert(draw->vertex_shader->state->output_semantic_name[0] == TGSI_SEMANTIC_POSITION); -#ifdef DEBUG - memset( &machine, 0, sizeof( machine ) ); +#ifdef DEBUG_foo + memset( machine, 0, sizeof( *machine ) ); #endif +#if 0 /* init machine state */ - tgsi_exec_machine_init(&machine, + tgsi_exec_machine_init(machine, draw->vertex_shader->state->tokens, PIPE_MAX_SAMPLERS, NULL /*samplers*/ ); +#endif /* Consts does not require 16 byte alignment. */ - machine.Consts = (float (*)[4]) draw->mapped_constants; + machine->Consts = (float (*)[4]) draw->mapped_constants; - machine.Inputs = ALIGN16_ASSIGN(inputs); - machine.Outputs = ALIGN16_ASSIGN(outputs); + machine->Inputs = ALIGN16_ASSIGN(inputs); + machine->Outputs = ALIGN16_ASSIGN(outputs); - draw_vertex_fetch( draw, &machine, elts, count ); + draw_vertex_fetch( draw, machine, elts, count ); /* run shader */ if( draw->vertex_shader->state->executable != NULL ) { + /* SSE */ codegen_function func = (codegen_function) draw->vertex_shader->state->executable; func( - machine.Inputs, - machine.Outputs, - machine.Consts, - machine.Temps ); + machine->Inputs, + machine->Outputs, + machine->Consts, + machine->Temps ); } else { - tgsi_exec_machine_run( &machine ); + /* interpreter */ + tgsi_exec_machine_run( machine ); } @@ -136,10 +140,10 @@ run_vertex_program(struct draw_context *draw, float x, y, z, w; /* Handle attr[0] (position) specially: */ - x = vOut[j]->clip[0] = machine.Outputs[0].xyzw[0].f[j]; - y = vOut[j]->clip[1] = machine.Outputs[0].xyzw[1].f[j]; - z = vOut[j]->clip[2] = machine.Outputs[0].xyzw[2].f[j]; - w = vOut[j]->clip[3] = machine.Outputs[0].xyzw[3].f[j]; + x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j]; + y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j]; + z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j]; + w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j]; vOut[j]->clipmask = compute_clipmask(x, y, z, w) | draw->user_clipmask; vOut[j]->edgeflag = 1; @@ -162,10 +166,10 @@ run_vertex_program(struct draw_context *draw, * Subtract two because of the VERTEX_HEADER, CLIP_POS attribs. */ for (slot = 1; slot < draw->vertex_info.num_attribs - 2; slot++) { - vOut[j]->data[slot][0] = machine.Outputs[slot].xyzw[0].f[j]; - vOut[j]->data[slot][1] = machine.Outputs[slot].xyzw[1].f[j]; - vOut[j]->data[slot][2] = machine.Outputs[slot].xyzw[2].f[j]; - vOut[j]->data[slot][3] = machine.Outputs[slot].xyzw[3].f[j]; + vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; /* printf("output %d: %f %f %f %f\n", slot, vOut[j]->data[slot][0], @@ -235,6 +239,12 @@ void draw_bind_vertex_shader(struct draw_context *draw, { draw_flush(draw); draw->vertex_shader = (struct draw_vertex_shader*)(vcso); + + /* init machine state */ + tgsi_exec_machine_init(&draw->machine, + draw->vertex_shader->state->tokens, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/ ); } void draw_delete_vertex_shader(struct draw_context *draw, diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c index 673d339f412..57c01dcfcc3 100755 --- a/src/mesa/pipe/softpipe/sp_quad_fs.c +++ b/src/mesa/pipe/softpipe/sp_quad_fs.c @@ -45,6 +45,8 @@ struct quad_shade_stage { struct quad_stage stage; struct tgsi_sampler samplers[PIPE_MAX_SAMPLERS]; + struct tgsi_exec_machine machine; + struct tgsi_exec_vector *inputs, *outputs; }; @@ -83,58 +85,41 @@ shade_quad( struct softpipe_context *softpipe = qs->softpipe; const float fx = (float) quad->x0; const float fy = (float) quad->y0; - struct tgsi_exec_machine machine; - - ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX); - ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX); - -#ifdef DEBUG - memset( &machine, 0, sizeof( machine ) ); -#endif - - /* init machine state */ - tgsi_exec_machine_init( - &machine, - softpipe->fs->tokens, - PIPE_MAX_SAMPLERS, - qss->samplers ); + struct tgsi_exec_machine *machine = &qss->machine; /* Consts does not require 16 byte alignment. */ - machine.Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT]; - - machine.Inputs = ALIGN16_ASSIGN(inputs); - machine.Outputs = ALIGN16_ASSIGN(outputs); + machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT]; - machine.InterpCoefs = quad->coef; + machine->InterpCoefs = quad->coef; - machine.Inputs[0].xyzw[0].f[0] = fx; - machine.Inputs[0].xyzw[0].f[1] = fx + 1.0f; - machine.Inputs[0].xyzw[0].f[2] = fx; - machine.Inputs[0].xyzw[0].f[3] = fx + 1.0f; + machine->Inputs[0].xyzw[0].f[0] = fx; + machine->Inputs[0].xyzw[0].f[1] = fx + 1.0f; + machine->Inputs[0].xyzw[0].f[2] = fx; + machine->Inputs[0].xyzw[0].f[3] = fx + 1.0f; - machine.Inputs[0].xyzw[1].f[0] = fy; - machine.Inputs[0].xyzw[1].f[1] = fy; - machine.Inputs[0].xyzw[1].f[2] = fy + 1.0f; - machine.Inputs[0].xyzw[1].f[3] = fy + 1.0f; + machine->Inputs[0].xyzw[1].f[0] = fy; + machine->Inputs[0].xyzw[1].f[1] = fy; + machine->Inputs[0].xyzw[1].f[2] = fy + 1.0f; + machine->Inputs[0].xyzw[1].f[3] = fy + 1.0f; /* run shader */ if( softpipe->fs->executable != NULL ) { codegen_function func = (codegen_function) softpipe->fs->executable; func( - machine.Inputs, - machine.Outputs, - machine.Consts, - machine.Temps, - machine.InterpCoefs ); + machine->Inputs, + machine->Outputs, + machine->Consts, + machine->Temps, + machine->InterpCoefs ); } else { - tgsi_exec_machine_run( &machine ); + tgsi_exec_machine_run( machine ); } /* store result color (always in output[1]) */ memcpy( quad->outputs.color, - &machine.Outputs[1].xyzw[0].f[0], + &machine->Outputs[1].xyzw[0].f[0], sizeof( quad->outputs.color ) ); #if 0 @@ -142,14 +127,14 @@ shade_quad( /* XXX temporary */ memcpy( quad->outputs.depth, - &machine.Outputs[0].xyzw[2], + machine->Outputs[0].xyzw[2], sizeof( quad->outputs.depth ) ); } #else { uint i; for (i = 0; i < 4; i++) { - quad->outputs.depth[i] = machine.Inputs[0].xyzw[2].f[i]; + quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i]; #if 0 printf("output z %f\n", quad->outputs.depth[i]); #endif @@ -188,6 +173,12 @@ static void shade_begin(struct quad_stage *qs) } } + /* XXX only do this if the fragment shader changes... */ + tgsi_exec_machine_init(&qss->machine, + softpipe->fs->tokens, + PIPE_MAX_SAMPLERS, + qss->samplers ); + if (qs->next) qs->next->begin(qs->next); } @@ -195,11 +186,17 @@ static void shade_begin(struct quad_stage *qs) struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) { - struct quad_shade_stage *stage = CALLOC_STRUCT(quad_shade_stage); + struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage); + + /* allocate storage for program inputs/outputs, aligned to 16 bytes */ + qss->inputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->inputs) + 16); + qss->outputs = malloc(PIPE_ATTRIB_MAX * sizeof(*qss->outputs) + 16); + qss->machine.Inputs = align16(qss->inputs); + qss->machine.Outputs = align16(qss->outputs); - stage->stage.softpipe = softpipe; - stage->stage.begin = shade_begin; - stage->stage.run = shade_quad; + qss->stage.softpipe = softpipe; + qss->stage.begin = shade_begin; + qss->stage.run = shade_quad; - return &stage->stage; + return &qss->stage; } diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c index 77a24ec1d8b..1c515a26e33 100644 --- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c +++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c @@ -65,6 +65,80 @@ #define CHAN_Z 2 #define CHAN_W 3 + +static void +expand_program(struct tgsi_exec_machine *mach ) +{ + struct tgsi_full_instruction *instructions; + struct tgsi_full_declaration *declarations; + struct tgsi_parse_context parse; + uint k; + uint maxInstructions = 10, numInstructions = 0; + uint maxDeclarations = 10, numDeclarations = 0; + + k = tgsi_parse_init( &parse, mach->Tokens ); + if (k != TGSI_PARSE_OK) { + printf("Problem parsing!\n"); + return; + } + + declarations = (struct tgsi_full_declaration *) + malloc(maxDeclarations * sizeof(struct tgsi_full_declaration)); + + instructions = (struct tgsi_full_instruction *) + malloc(maxInstructions * sizeof(struct tgsi_full_instruction)); + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + tgsi_parse_token( &parse ); + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + /* + exec_declaration( mach, &parse.FullToken.FullDeclaration ); + */ + if (numDeclarations == maxDeclarations) { + maxDeclarations += 10; + declarations = realloc(declarations, + maxDeclarations + * sizeof(struct tgsi_full_instruction)); + } + memcpy(declarations + numDeclarations, + &parse.FullToken.FullInstruction, + sizeof(declarations[0])); + numDeclarations++; + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (numInstructions == maxInstructions) { + maxInstructions += 10; + instructions = realloc(instructions, + maxInstructions + * sizeof(struct tgsi_full_instruction)); + } + memcpy(instructions + numInstructions, + &parse.FullToken.FullInstruction, + sizeof(instructions[0])); + numInstructions++; + break; + default: + assert( 0 ); + } + } + tgsi_parse_free (&parse); + + assert(!mach->Instructions); + mach->Instructions = instructions; + mach->NumInstructions = numInstructions; + mach->Declarations = declarations; + mach->NumDeclarations = numDeclarations; +} + + +/** + * Initialize machine state by expanding tokens to full instructions, + * allocating temporary storage, setting up constants, etc. + * After this, we can call tgsi_exec_machine_run() many times. + */ void tgsi_exec_machine_init( struct tgsi_exec_machine *mach, @@ -103,16 +177,32 @@ tgsi_exec_machine_init( mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f; } + if (mach->Declarations) { + free(mach->Declarations); + mach->Declarations = NULL; + mach->NumDeclarations = 0; + } + if (mach->Instructions) { + free(mach->Instructions); + mach->Instructions = NULL; + mach->NumInstructions = 0; + } + mach->CondMask = 0xf; mach->LoopMask = 0xf; mach->ExecMask = 0xf; + +#if 01 + tgsi_exec_prepare( mach ); + expand_program(mach); +#endif } void tgsi_exec_prepare( - struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ) + struct tgsi_exec_machine *mach ) { + struct tgsi_exec_labels *labels = &mach->Labels; struct tgsi_parse_context parse; GLuint k; GLuint instno = 0; @@ -164,10 +254,10 @@ void tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) { - struct tgsi_exec_labels labels; - - tgsi_exec_prepare( mach, &labels ); - tgsi_exec_machine_run2( mach, &labels ); +#if 0 + tgsi_exec_prepare( mach ); +#endif + tgsi_exec_machine_run2( mach ); } static void @@ -2170,77 +2260,9 @@ exec_instruction( } -static void -expand_program(struct tgsi_exec_machine *mach ) -{ - struct tgsi_full_instruction *instructions; - struct tgsi_full_declaration *declarations; - struct tgsi_parse_context parse; - uint k; - uint maxInstructions = 10, numInstructions = 0; - uint maxDeclarations = 10, numDeclarations = 0; - - k = tgsi_parse_init( &parse, mach->Tokens ); - if (k != TGSI_PARSE_OK) { - printf("Problem parsing!\n"); - return; - } - - declarations = (struct tgsi_full_declaration *) - malloc(maxDeclarations * sizeof(struct tgsi_full_declaration)); - - instructions = (struct tgsi_full_instruction *) - malloc(maxInstructions * sizeof(struct tgsi_full_instruction)); - - while( !tgsi_parse_end_of_tokens( &parse ) ) { - tgsi_parse_token( &parse ); - switch( parse.FullToken.Token.Type ) { - case TGSI_TOKEN_TYPE_DECLARATION: - /* - exec_declaration( mach, &parse.FullToken.FullDeclaration ); - */ - if (numDeclarations == maxDeclarations) { - maxDeclarations += 10; - declarations = realloc(declarations, - maxDeclarations - * sizeof(struct tgsi_full_instruction)); - } - memcpy(declarations + numDeclarations, - &parse.FullToken.FullInstruction, - sizeof(declarations[0])); - numDeclarations++; - break; - case TGSI_TOKEN_TYPE_IMMEDIATE: - break; - case TGSI_TOKEN_TYPE_INSTRUCTION: - if (numInstructions == maxInstructions) { - maxInstructions += 10; - instructions = realloc(instructions, - maxInstructions - * sizeof(struct tgsi_full_instruction)); - } - memcpy(instructions + numInstructions, - &parse.FullToken.FullInstruction, - sizeof(instructions[0])); - numInstructions++; - break; - default: - assert( 0 ); - } - } - tgsi_parse_free (&parse); - - mach->Instructions = instructions; - mach->NumInstructions = numInstructions; - mach->Declarations = declarations; - mach->NumDeclarations = numDeclarations; -} - - void tgsi_exec_machine_run2( - struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ) + struct tgsi_exec_machine *mach ) { #if 0 && MESA GET_CURRENT_CONTEXT(ctx); @@ -2255,9 +2277,11 @@ tgsi_exec_machine_run2( GLuint k; #endif +#if 0 if (!mach->Instructions) { expand_program(mach); } +#endif mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; @@ -2305,8 +2329,10 @@ tgsi_exec_machine_run2( exec_instruction( mach, mach->Instructions + pc, &pc ); } +#if 0 free(mach->Declarations); free(mach->Instructions); +#endif } #endif diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.h b/src/mesa/pipe/tgsi/exec/tgsi_exec.h index 8997ea9c090..2b493ff6821 100644 --- a/src/mesa/pipe/tgsi/exec/tgsi_exec.h +++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.h @@ -154,6 +154,8 @@ struct tgsi_exec_machine struct tgsi_full_declaration *Declarations; uint NumDeclarations; + + struct tgsi_exec_labels Labels; }; @@ -166,8 +168,7 @@ tgsi_exec_machine_init( void tgsi_exec_prepare( - struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ); + struct tgsi_exec_machine *mach ); void tgsi_exec_machine_run( @@ -175,8 +176,7 @@ tgsi_exec_machine_run( void tgsi_exec_machine_run2( - struct tgsi_exec_machine *mach, - struct tgsi_exec_labels *labels ); + struct tgsi_exec_machine *mach ); #if defined __cplusplus } // extern "C" -- 2.30.2