From: Keith Whitwell Date: Fri, 15 Feb 2008 11:15:47 +0000 (+0000) Subject: Merge commit 'origin/gallium-0.1' into gallium-0.1 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6ac2c1cc0cd1253ba2014d459010032127f185ec;p=mesa.git Merge commit 'origin/gallium-0.1' into gallium-0.1 Conflicts: src/gallium/drivers/softpipe/sp_quad_fs.c src/gallium/drivers/softpipe/sp_state.h src/gallium/drivers/softpipe/sp_state_fs.c --- 6ac2c1cc0cd1253ba2014d459010032127f185ec diff --cc src/gallium/aux/draw/draw_vertex_shader.c index 00000000000,377ecbb931c..9413f8b43a4 mode 000000,100644..100644 --- a/src/gallium/aux/draw/draw_vertex_shader.c +++ b/src/gallium/aux/draw/draw_vertex_shader.c @@@ -1,0 -1,325 +1,327 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell + * Brian Paul + */ + + #include "pipe/p_util.h" + #include "pipe/p_shader_tokens.h" + #if defined(__i386__) || defined(__386__) + #include "tgsi/exec/tgsi_sse2.h" + #endif + #include "draw_private.h" + #include "draw_context.h" + + #include "x86/rtasm/x86sse.h" + #include "llvm/gallivm.h" + + + #define DBG_VS 0 + + + static INLINE unsigned + compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr) + { + unsigned mask = 0; + unsigned i; + + /* Do the hardwired planes first: + */ + if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT; + if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT; + if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT; + if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT; + if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT; + if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT; + + /* Followed by any remaining ones: + */ + for (i = 6; i < nr; i++) { + if (dot4(clip, plane[i]) < 0) + mask |= (1<machine; + unsigned int j; + + ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX); + ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX); + const float *scale = draw->viewport.scale; + const float *trans = draw->viewport.translate; + + assert(count <= 4); + assert(draw->vertex_shader->state->output_semantic_name[0] + == TGSI_SEMANTIC_POSITION); + + /* Consts does not require 16 byte alignment. */ + machine->Consts = (float (*)[4]) draw->user.constants; + + machine->Inputs = ALIGN16_ASSIGN(inputs); + machine->Outputs = ALIGN16_ASSIGN(outputs); + + draw->vertex_fetch.fetch_func( draw, machine, elts, count ); + + /* run shader */ + #ifdef MESA_LLVM + if (1) { + struct gallivm_prog *prog = draw->vertex_shader->llvm_prog; + gallivm_cpu_vs_exec(prog, + machine->Inputs, + machine->Outputs, + machine->Consts, + machine->Temps); + } else + #elif defined(__i386__) || defined(__386__) + if (draw->use_sse) { + /* SSE */ + /* cast away const */ + struct draw_vertex_shader *shader + = (struct draw_vertex_shader *)draw->vertex_shader; + codegen_function func + = (codegen_function) x86_get_func( &shader->sse2_program ); + + if (func) + func( + machine->Inputs, + machine->Outputs, + machine->Consts, + machine->Temps ); + else + /* interpreter */ + tgsi_exec_machine_run( machine ); + } + else + #endif + { + /* interpreter */ + tgsi_exec_machine_run( machine ); + } + + /* store machine results */ + for (j = 0; j < count; j++) { + unsigned slot; + float x, y, z, w; + + /* Handle attr[0] (position) specially: + * + * XXX: Computing the clipmask should be done in the vertex + * program as a set of DP4 instructions appended to the + * user-provided code. + */ + x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j]; + y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j]; + z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j]; + w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j]; + + vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes); + vOut[j]->edgeflag = 1; + + /* divide by w */ + w = 1.0f / w; + x *= w; + y *= w; + z *= w; + + /* Viewport mapping */ + vOut[j]->data[0][0] = x * scale[0] + trans[0]; + vOut[j]->data[0][1] = y * scale[1] + trans[1]; + vOut[j]->data[0][2] = z * scale[2] + trans[2]; + vOut[j]->data[0][3] = w; + + #if DBG_VS + debug_printf("output[%d]win: %f %f %f %f\n", j, + vOut[j]->data[0][0], + vOut[j]->data[0][1], + vOut[j]->data[0][2], + vOut[j]->data[0][3]); + #endif + /* Remaining attributes are packed into sequential post-transform + * vertex attrib slots. + */ + for (slot = 1; slot < draw->num_vs_outputs; slot++) { + vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; + #if DBG_VS + debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot, + vOut[j]->data[slot][0], + vOut[j]->data[slot][1], + vOut[j]->data[slot][2], + vOut[j]->data[slot][3]); + #endif + } + } /* loop over vertices */ + } + + + /** + * Run the vertex shader on all vertices in the vertex queue. + * Called by the draw module when the vertx cache needs to be flushed. + */ + void + draw_vertex_shader_queue_flush(struct draw_context *draw) + { + unsigned i; + + assert(draw->vs.queue_nr != 0); + + /* XXX: do this on statechange: + */ + draw_update_vertex_fetch( draw ); + + // fprintf(stderr, " q(%d) ", draw->vs.queue_nr ); + + /* run vertex shader on vertex cache entries, four per invokation */ + for (i = 0; i < draw->vs.queue_nr; i += 4) { + struct vertex_header *dests[4]; + unsigned elts[4]; + int j, n = MIN2(4, draw->vs.queue_nr - i); + + for (j = 0; j < n; j++) { + elts[j] = draw->vs.queue[i + j].elt; + dests[j] = draw->vs.queue[i + j].dest; + } + + for ( ; j < 4; j++) { + elts[j] = elts[0]; + dests[j] = dests[0]; + } + + assert(n > 0); + assert(n <= 4); + + run_vertex_program(draw, elts, n, dests); + } + + draw->vs.queue_nr = 0; + } + + + struct draw_vertex_shader * + draw_create_vertex_shader(struct draw_context *draw, + const struct pipe_shader_state *shader) + { + struct draw_vertex_shader *vs; + + vs = CALLOC_STRUCT( draw_vertex_shader ); + if (vs == NULL) { + return NULL; + } + + vs->state = shader; + + #ifdef MESA_LLVM + struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS); + gallivm_ir_set_layout(ir, GALLIVM_SOA); + gallivm_ir_set_components(ir, 4); + gallivm_ir_fill_from_tgsi(ir, shader->tokens); + vs->llvm_prog = gallivm_ir_compile(ir); + gallivm_ir_delete(ir); + + draw->engine = gallivm_global_cpu_engine(); + if (!draw->engine) { + draw->engine = gallivm_cpu_engine_create(vs->llvm_prog); + } + else { + gallivm_cpu_jit_compile(draw->engine, vs->llvm_prog); + } + #elif defined(__i386__) || defined(__386__) + if (draw->use_sse) { + /* cast-away const */ + struct pipe_shader_state *sh = (struct pipe_shader_state *) shader; + + x86_init_func( &vs->sse2_program ); + if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens, + &vs->sse2_program )) { + x86_release_func( (struct x86_function *) &vs->sse2_program ); + fprintf(stdout /*err*/, + "tgsi_emit_sse2() failed, falling back to interpreter\n"); + } + } + #endif + + return vs; + } + + + void + draw_bind_vertex_shader(struct draw_context *draw, + struct draw_vertex_shader *dvs) + { + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + + draw->vertex_shader = dvs; + draw->num_vs_outputs = dvs->state->num_outputs; + ++ tgsi_exec_machine_init(&draw->machine); ++ + /* specify the vertex program to interpret/execute */ - tgsi_exec_machine_init(&draw->machine, - draw->vertex_shader->state->tokens, - PIPE_MAX_SAMPLERS, - NULL /*samplers*/ ); ++ tgsi_exec_machine_bind_shader(&draw->machine, ++ draw->vertex_shader->state->tokens, ++ PIPE_MAX_SAMPLERS, ++ NULL /*samplers*/ ); + } + + + void + draw_delete_vertex_shader(struct draw_context *draw, + struct draw_vertex_shader *dvs) + { + #if defined(__i386__) || defined(__386__) + x86_release_func( (struct x86_function *) &dvs->sse2_program ); + #endif + + FREE( dvs ); + } diff --cc src/gallium/aux/tgsi/exec/tgsi_exec.c index 00000000000,a8f64c2287f..d7b18dc9c59 mode 000000,100644..100644 --- a/src/gallium/aux/tgsi/exec/tgsi_exec.c +++ b/src/gallium/aux/tgsi/exec/tgsi_exec.c @@@ -1,0 -1,2485 +1,2476 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /** + * TGSI interpretor/executor. + * + * Flow control information: + * + * Since we operate on 'quads' (4 pixels or 4 vertices in parallel) + * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special + * care since a condition may be true for some quad components but false + * for other components. + * + * We basically execute all statements (even if they're in the part of + * an IF/ELSE clause that's "not taken") and use a special mask to + * control writing to destination registers. This is the ExecMask. + * See store_dest(). + * + * The ExecMask is computed from three other masks (CondMask, LoopMask and + * ContMask) which are controlled by the flow control instructions (namely: + * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT). + * + * + * Authors: + * Michal Krol + * Brian Paul + */ + + #include "pipe/p_compiler.h" + #include "pipe/p_state.h" + #include "pipe/p_util.h" + #include "pipe/p_shader_tokens.h" + #include "tgsi/util/tgsi_parse.h" + #include "tgsi/util/tgsi_util.h" + #include "tgsi_exec.h" + + #define TILE_TOP_LEFT 0 + #define TILE_TOP_RIGHT 1 + #define TILE_BOTTOM_LEFT 2 + #define TILE_BOTTOM_RIGHT 3 + + /* + * Shorthand locations of various utility registers (_I = Index, _C = Channel) + */ + #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I + #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C + #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I + #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C + #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I + #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C + #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I + #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C + #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I + #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C + #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I + #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C + #define TEMP_128_I TGSI_EXEC_TEMP_128_I + #define TEMP_128_C TGSI_EXEC_TEMP_128_C + #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I + #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C + #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I + #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C + #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I + #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C + #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I + #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C + #define TEMP_R0 TGSI_EXEC_TEMP_R0 + + #define FOR_EACH_CHANNEL(CHAN)\ + for (CHAN = 0; CHAN < 4; CHAN++) + + #define IS_CHANNEL_ENABLED(INST, CHAN)\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + + #define IS_CHANNEL_ENABLED2(INST, CHAN)\ + ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN))) + + #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED( INST, CHAN )) + + #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED2( INST, CHAN )) + + + /** The execution mask depends on the conditional mask and the loop mask */ + #define UPDATE_EXEC_MASK(MACH) \ + MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + + + #define CHAN_X 0 + #define CHAN_Y 1 + #define CHAN_Z 2 + #define CHAN_W 3 + + + -static void -tgsi_exec_prepare( struct tgsi_exec_machine *mach ) ++/** ++ * Initialize machine state by expanding tokens to full instructions, ++ * allocating temporary storage, setting up constants, etc. ++ * After this, we can call tgsi_exec_machine_run() many times. ++ */ ++void ++tgsi_exec_machine_bind_shader( ++ struct tgsi_exec_machine *mach, ++ const struct tgsi_token *tokens, ++ uint numSamplers, ++ struct tgsi_sampler *samplers) + { - struct tgsi_exec_labels *labels = &mach->Labels; ++ uint k; + struct tgsi_parse_context parse; ++ struct tgsi_exec_labels *labels = &mach->Labels; + struct tgsi_full_instruction *instructions; + struct tgsi_full_declaration *declarations; + uint maxInstructions = 10, numInstructions = 0; + uint maxDeclarations = 10, numDeclarations = 0; - uint k; + uint instno = 0; + ++#if 0 ++ tgsi_dump(tokens, 0); ++#endif ++ ++ mach->Tokens = tokens; ++ mach->Samplers = samplers; ++ ++ k = tgsi_parse_init (&parse, mach->Tokens); ++ if (k != TGSI_PARSE_OK) { ++ debug_printf( "Problem parsing!\n" ); ++ return; ++ } ++ ++ mach->Processor = parse.FullHeader.Processor.Processor; + mach->ImmLimit = 0; + labels->count = 0; + + declarations = (struct tgsi_full_declaration *) + MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) ); + + instructions = (struct tgsi_full_instruction *) + MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) ); + - k = tgsi_parse_init( &parse, mach->Tokens ); - if (k != TGSI_PARSE_OK) { - debug_printf("Problem parsing!\n"); - return; - } + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + uint pointer = parse.Position; + uint i; + + tgsi_parse_token( &parse ); + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + /* save expanded declaration */ + if (numDeclarations == maxDeclarations) { + declarations = REALLOC(declarations, + maxDeclarations + * sizeof(struct tgsi_full_declaration), + (maxDeclarations + 10) + * sizeof(struct tgsi_full_declaration)); + maxDeclarations += 10; + } + memcpy(declarations + numDeclarations, + &parse.FullToken.FullDeclaration, + sizeof(declarations[0])); + numDeclarations++; + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; + assert( size % 4 == 0 ); + assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES ); + + for( i = 0; i < size; i++ ) { - mach->Imms[mach->ImmLimit + i / 4][i % 4] = parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; ++ mach->Imms[mach->ImmLimit + i / 4][i % 4] = ++ parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; + } + mach->ImmLimit += size / 4; + } + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + assert( labels->count < 128 ); + + labels->labels[labels->count][0] = instno; + labels->labels[labels->count][1] = pointer; + labels->count++; + + /* save expanded instruction */ + if (numInstructions == maxInstructions) { + instructions = REALLOC(instructions, + maxInstructions + * sizeof(struct tgsi_full_instruction), + (maxInstructions + 10) + * sizeof(struct tgsi_full_instruction)); + maxInstructions += 10; + } + memcpy(instructions + numInstructions, + &parse.FullToken.FullInstruction, + sizeof(instructions[0])); + numInstructions++; + break; + + default: + assert( 0 ); + } + } + tgsi_parse_free (&parse); + + if (mach->Declarations) { + FREE( mach->Declarations ); + } + mach->Declarations = declarations; + mach->NumDeclarations = numDeclarations; + + if (mach->Instructions) { + FREE( mach->Instructions ); + } + mach->Instructions = instructions; + mach->NumInstructions = numInstructions; + } + + -/** - * Initialize machine state by expanding tokens to full instructions, - * allocating temporary storage, setting up constants, etc. - * After this, we can call tgsi_exec_machine_run() many times. - */ + void + tgsi_exec_machine_init( - struct tgsi_exec_machine *mach, - const struct tgsi_token *tokens, - uint numSamplers, - struct tgsi_sampler *samplers) ++ struct tgsi_exec_machine *mach ) + { - uint i, k; - struct tgsi_parse_context parse; - -#if 0 - tgsi_dump(tokens, 0); -#endif - - mach->Tokens = tokens; - - mach->Samplers = samplers; - - k = tgsi_parse_init (&parse, mach->Tokens); - if (k != TGSI_PARSE_OK) { - debug_printf( "Problem parsing!\n" ); - return; - } - - mach->Processor = parse.FullHeader.Processor.Processor; - tgsi_parse_free (&parse); ++ uint i; + + mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps); + mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; + + /* Setup constants. */ + for( i = 0; i < 4; i++ ) { + mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000; + mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF; + mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000; + mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF; + mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f; + mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f; + mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f; + mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f; + } - - tgsi_exec_prepare( mach ); + } + + + void + tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach) + { + if (mach->Instructions) { + FREE(mach->Instructions); + mach->Instructions = NULL; + mach->NumInstructions = 0; + } + if (mach->Declarations) { + FREE(mach->Declarations); + mach->Declarations = NULL; + mach->NumDeclarations = 0; + } + } + + + static void + micro_abs( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) fabs( (double) src->f[0] ); + dst->f[1] = (float) fabs( (double) src->f[1] ); + dst->f[2] = (float) fabs( (double) src->f[2] ); + dst->f[3] = (float) fabs( (double) src->f[3] ); + } + + static void + micro_add( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] + src1->f[0]; + dst->f[1] = src0->f[1] + src1->f[1]; + dst->f[2] = src0->f[2] + src1->f[2]; + dst->f[3] = src0->f[3] + src1->f[3]; + } + + static void + micro_iadd( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] + src1->i[0]; + dst->i[1] = src0->i[1] + src1->i[1]; + dst->i[2] = src0->i[2] + src1->i[2]; + dst->i[3] = src0->i[3] + src1->i[3]; + } + + static void + micro_and( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] & src1->u[0]; + dst->u[1] = src0->u[1] & src1->u[1]; + dst->u[2] = src0->u[2] & src1->u[2]; + dst->u[3] = src0->u[3] & src1->u[3]; + } + + static void + micro_ceil( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) ceil( (double) src->f[0] ); + dst->f[1] = (float) ceil( (double) src->f[1] ); + dst->f[2] = (float) ceil( (double) src->f[2] ); + dst->f[3] = (float) ceil( (double) src->f[3] ); + } + + static void + micro_cos( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) cos( (double) src->f[0] ); + dst->f[1] = (float) cos( (double) src->f[1] ); + dst->f[2] = (float) cos( (double) src->f[2] ); + dst->f[3] = (float) cos( (double) src->f[3] ); + } + + static void + micro_ddx( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = + dst->f[1] = + dst->f[2] = + dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT]; + } + + static void + micro_ddy( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = + dst->f[1] = + dst->f[2] = + dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT]; + } + + static void + micro_div( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] / src1->f[0]; + dst->f[1] = src0->f[1] / src1->f[1]; + dst->f[2] = src0->f[2] / src1->f[2]; + dst->f[3] = src0->f[3] / src1->f[3]; + } + + static void + micro_udiv( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] / src1->u[0]; + dst->u[1] = src0->u[1] / src1->u[1]; + dst->u[2] = src0->u[2] / src1->u[2]; + dst->u[3] = src0->u[3] / src1->u[3]; + } + + static void + micro_eq( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3]; + } + + static void + micro_ieq( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0]; + dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1]; + dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2]; + dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3]; + } + + static void + micro_exp2( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src) + { + dst->f[0] = (float) pow( 2.0, (double) src->f[0] ); + dst->f[1] = (float) pow( 2.0, (double) src->f[1] ); + dst->f[2] = (float) pow( 2.0, (double) src->f[2] ); + dst->f[3] = (float) pow( 2.0, (double) src->f[3] ); + } + + static void + micro_f2it( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->i[0] = (int) src->f[0]; + dst->i[1] = (int) src->f[1]; + dst->i[2] = (int) src->f[2]; + dst->i[3] = (int) src->f[3]; + } + + static void + micro_f2ut( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->u[0] = (uint) src->f[0]; + dst->u[1] = (uint) src->f[1]; + dst->u[2] = (uint) src->f[2]; + dst->u[3] = (uint) src->f[3]; + } + + static void + micro_flr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) floor( (double) src->f[0] ); + dst->f[1] = (float) floor( (double) src->f[1] ); + dst->f[2] = (float) floor( (double) src->f[2] ); + dst->f[3] = (float) floor( (double) src->f[3] ); + } + + static void + micro_frc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] ); + dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] ); + dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] ); + dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] ); + } + + static void + micro_ge( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3]; + } + + static void + micro_i2f( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) src->i[0]; + dst->f[1] = (float) src->i[1]; + dst->f[2] = (float) src->i[2]; + dst->f[3] = (float) src->i[3]; + } + + static void + micro_lg2( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f; + dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f; + dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f; + dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f; + } + + static void + micro_lt( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3]; + } + + static void + micro_ilt( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0]; + dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1]; + dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2]; + dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3]; + } + + static void + micro_ult( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0]; + dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1]; + dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2]; + dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3]; + } + + static void + micro_max( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0]; + dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1]; + dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2]; + dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3]; + } + + static void + micro_imax( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0]; + dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1]; + dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2]; + dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3]; + } + + static void + micro_umax( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0]; + dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1]; + dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2]; + dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3]; + } + + static void + micro_min( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0]; + dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1]; + dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2]; + dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3]; + } + + static void + micro_imin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0]; + dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1]; + dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2]; + dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3]; + } + + static void + micro_umin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0]; + dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1]; + dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2]; + dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3]; + } + + static void + micro_umod( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] % src1->u[0]; + dst->u[1] = src0->u[1] % src1->u[1]; + dst->u[2] = src0->u[2] % src1->u[2]; + dst->u[3] = src0->u[3] % src1->u[3]; + } + + static void + micro_mul( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] * src1->f[0]; + dst->f[1] = src0->f[1] * src1->f[1]; + dst->f[2] = src0->f[2] * src1->f[2]; + dst->f[3] = src0->f[3] * src1->f[3]; + } + + static void + micro_imul( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] * src1->i[0]; + dst->i[1] = src0->i[1] * src1->i[1]; + dst->i[2] = src0->i[2] * src1->i[2]; + dst->i[3] = src0->i[3] * src1->i[3]; + } + + static void + micro_imul64( + union tgsi_exec_channel *dst0, + union tgsi_exec_channel *dst1, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst1->i[0] = src0->i[0] * src1->i[0]; + dst1->i[1] = src0->i[1] * src1->i[1]; + dst1->i[2] = src0->i[2] * src1->i[2]; + dst1->i[3] = src0->i[3] * src1->i[3]; + dst0->i[0] = 0; + dst0->i[1] = 0; + dst0->i[2] = 0; + dst0->i[3] = 0; + } + + static void + micro_umul64( + union tgsi_exec_channel *dst0, + union tgsi_exec_channel *dst1, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst1->u[0] = src0->u[0] * src1->u[0]; + dst1->u[1] = src0->u[1] * src1->u[1]; + dst1->u[2] = src0->u[2] * src1->u[2]; + dst1->u[3] = src0->u[3] * src1->u[3]; + dst0->u[0] = 0; + dst0->u[1] = 0; + dst0->u[2] = 0; + dst0->u[3] = 0; + } + + static void + micro_movc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2 ) + { + dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0]; + dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1]; + dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2]; + dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3]; + } + + static void + micro_neg( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = -src->f[0]; + dst->f[1] = -src->f[1]; + dst->f[2] = -src->f[2]; + dst->f[3] = -src->f[3]; + } + + static void + micro_ineg( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->i[0] = -src->i[0]; + dst->i[1] = -src->i[1]; + dst->i[2] = -src->i[2]; + dst->i[3] = -src->i[3]; + } + + static void + micro_not( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->u[0] = ~src->u[0]; + dst->u[1] = ~src->u[1]; + dst->u[2] = ~src->u[2]; + dst->u[3] = ~src->u[3]; + } + + static void + micro_or( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] | src1->u[0]; + dst->u[1] = src0->u[1] | src1->u[1]; + dst->u[2] = src0->u[2] | src1->u[2]; + dst->u[3] = src0->u[3] | src1->u[3]; + } + + static void + micro_pow( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] ); + dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] ); + dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] ); + dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] ); + } + + static void + micro_rnd( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) ); + dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) ); + dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) ); + dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) ); + } + + static void + micro_shl( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] << src1->i[0]; + dst->i[1] = src0->i[1] << src1->i[1]; + dst->i[2] = src0->i[2] << src1->i[2]; + dst->i[3] = src0->i[3] << src1->i[3]; + } + + static void + micro_ishr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] >> src1->i[0]; + dst->i[1] = src0->i[1] >> src1->i[1]; + dst->i[2] = src0->i[2] >> src1->i[2]; + dst->i[3] = src0->i[3] >> src1->i[3]; + } + + static void + micro_trunc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0 ) + { + dst->f[0] = (float) (int) src0->f[0]; + dst->f[1] = (float) (int) src0->f[1]; + dst->f[2] = (float) (int) src0->f[2]; + dst->f[3] = (float) (int) src0->f[3]; + } + + static void + micro_ushr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] >> src1->u[0]; + dst->u[1] = src0->u[1] >> src1->u[1]; + dst->u[2] = src0->u[2] >> src1->u[2]; + dst->u[3] = src0->u[3] >> src1->u[3]; + } + + static void + micro_sin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) sin( (double) src->f[0] ); + dst->f[1] = (float) sin( (double) src->f[1] ); + dst->f[2] = (float) sin( (double) src->f[2] ); + dst->f[3] = (float) sin( (double) src->f[3] ); + } + + static void + micro_sqrt( union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) sqrt( (double) src->f[0] ); + dst->f[1] = (float) sqrt( (double) src->f[1] ); + dst->f[2] = (float) sqrt( (double) src->f[2] ); + dst->f[3] = (float) sqrt( (double) src->f[3] ); + } + + static void + micro_sub( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] - src1->f[0]; + dst->f[1] = src0->f[1] - src1->f[1]; + dst->f[2] = src0->f[2] - src1->f[2]; + dst->f[3] = src0->f[3] - src1->f[3]; + } + + static void + micro_u2f( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) src->u[0]; + dst->f[1] = (float) src->u[1]; + dst->f[2] = (float) src->u[2]; + dst->f[3] = (float) src->u[3]; + } + + static void + micro_xor( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] ^ src1->u[0]; + dst->u[1] = src0->u[1] ^ src1->u[1]; + dst->u[2] = src0->u[2] ^ src1->u[2]; + dst->u[3] = src0->u[3] ^ src1->u[3]; + } + + static void + fetch_src_file_channel( + const struct tgsi_exec_machine *mach, + const uint file, + const uint swizzle, + const union tgsi_exec_channel *index, + union tgsi_exec_channel *chan ) + { + switch( swizzle ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch( file ) { + case TGSI_FILE_CONSTANT: + chan->f[0] = mach->Consts[index->i[0]][swizzle]; + chan->f[1] = mach->Consts[index->i[1]][swizzle]; + chan->f[2] = mach->Consts[index->i[2]][swizzle]; + chan->f[3] = mach->Consts[index->i[3]][swizzle]; + break; + + case TGSI_FILE_INPUT: + chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_TEMPORARY: + chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_IMMEDIATE: + assert( index->i[0] < (int) mach->ImmLimit ); + chan->f[0] = mach->Imms[index->i[0]][swizzle]; + assert( index->i[1] < (int) mach->ImmLimit ); + chan->f[1] = mach->Imms[index->i[1]][swizzle]; + assert( index->i[2] < (int) mach->ImmLimit ); + chan->f[2] = mach->Imms[index->i[2]][swizzle]; + assert( index->i[3] < (int) mach->ImmLimit ); + chan->f[3] = mach->Imms[index->i[3]][swizzle]; + break; + + case TGSI_FILE_ADDRESS: + chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_OUTPUT: + /* vertex/fragment output vars can be read too */ + chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + default: + assert( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]; + break; + + case TGSI_EXTSWIZZLE_ONE: + *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]; + break; + + default: + assert( 0 ); + } + } + + static void + fetch_source( + const struct tgsi_exec_machine *mach, + union tgsi_exec_channel *chan, + const struct tgsi_full_src_register *reg, + const uint chan_index ) + { + union tgsi_exec_channel index; + uint swizzle; + + index.i[0] = + index.i[1] = + index.i[2] = + index.i[3] = reg->SrcRegister.Index; + + if (reg->SrcRegister.Indirect) { + union tgsi_exec_channel index2; + union tgsi_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterInd.File, + swizzle, + &index2, + &indir_index ); + + index.i[0] += indir_index.i[0]; + index.i[1] += indir_index.i[1]; + index.i[2] += indir_index.i[2]; + index.i[3] += indir_index.i[3]; + } + + if( reg->SrcRegister.Dimension ) { + switch( reg->SrcRegister.File ) { + case TGSI_FILE_INPUT: + index.i[0] *= 17; + index.i[1] *= 17; + index.i[2] *= 17; + index.i[3] *= 17; + break; + case TGSI_FILE_CONSTANT: + index.i[0] *= 4096; + index.i[1] *= 4096; + index.i[2] *= 4096; + index.i[3] *= 4096; + break; + default: + assert( 0 ); + } + + index.i[0] += reg->SrcRegisterDim.Index; + index.i[1] += reg->SrcRegisterDim.Index; + index.i[2] += reg->SrcRegisterDim.Index; + index.i[3] += reg->SrcRegisterDim.Index; + + if (reg->SrcRegisterDim.Indirect) { + union tgsi_exec_channel index2; + union tgsi_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterDimInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterDimInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterDimInd.File, + swizzle, + &index2, + &indir_index ); + + index.i[0] += indir_index.i[0]; + index.i[1] += indir_index.i[1]; + index.i[2] += indir_index.i[2]; + index.i[3] += indir_index.i[3]; + } + } + + swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + fetch_src_file_channel( + mach, + reg->SrcRegister.File, + swizzle, + &index, + chan ); + + switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { + case TGSI_UTIL_SIGN_CLEAR: + micro_abs( chan, chan ); + break; + + case TGSI_UTIL_SIGN_SET: + micro_abs( chan, chan ); + micro_neg( chan, chan ); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + micro_neg( chan, chan ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } + + if (reg->SrcRegisterExtMod.Complement) { + micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan ); + } + } + + static void + store_dest( + struct tgsi_exec_machine *mach, + const union tgsi_exec_channel *chan, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + uint chan_index ) + { + union tgsi_exec_channel *dst; + + switch( reg->DstRegister.File ) { + case TGSI_FILE_NULL: + return; + + case TGSI_FILE_OUTPUT: + dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_TEMPORARY: + dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_ADDRESS: + dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + break; + + default: + assert( 0 ); + return; + } + + switch (inst->Instruction.Saturate) + { + case TGSI_SAT_NONE: + if (mach->ExecMask & 0x1) + dst->i[0] = chan->i[0]; + if (mach->ExecMask & 0x2) + dst->i[1] = chan->i[1]; + if (mach->ExecMask & 0x4) + dst->i[2] = chan->i[2]; + if (mach->ExecMask & 0x8) + dst->i[3] = chan->i[3]; + break; + + case TGSI_SAT_ZERO_ONE: + /* XXX need to obey ExecMask here */ + micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]); + micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]); + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + assert( 0 ); + break; + + default: + assert( 0 ); + } + } + + #define FETCH(VAL,INDEX,CHAN)\ + fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN) + + #define STORE(VAL,INDEX,CHAN)\ + store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN ) + + + /** + * Execute ARB-style KIL which is predicated by a src register. + * Kill fragment if any of the four values is less than zero. + */ + static void + exec_kilp(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) + { + uint uniquemask; + uint chan_index; + uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + union tgsi_exec_channel r[1]; + + /* This mask stores component bits that were already tested. Note that + * we test if the value is less than zero, so 1.0 and 0.0 need not to be + * tested. */ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + for (chan_index = 0; chan_index < 4; chan_index++) + { + uint swizzle; + uint i; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle ( + &inst->FullSrcRegisters[0], + chan_index); + + /* check if the component has not been already tested */ + if (uniquemask & (1 << swizzle)) + continue; + uniquemask |= 1 << swizzle; + + FETCH(&r[0], 0, chan_index); + for (i = 0; i < 4; i++) + if (r[0].f[i] < 0.0f) + kilmask |= 1 << i; + } + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; + } + + + /* + * Fetch a texel using STR texture coordinates. + */ + static void + fetch_texel( struct tgsi_sampler *sampler, + const union tgsi_exec_channel *s, + const union tgsi_exec_channel *t, + const union tgsi_exec_channel *p, + float lodbias, /* XXX should be float[4] */ + union tgsi_exec_channel *r, + union tgsi_exec_channel *g, + union tgsi_exec_channel *b, + union tgsi_exec_channel *a ) + { + uint j; + float rgba[NUM_CHANNELS][QUAD_SIZE]; + + sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba); + + for (j = 0; j < 4; j++) { + r->f[j] = rgba[0][j]; + g->f[j] = rgba[1][j]; + b->f[j] = rgba[2][j]; + a->f[j] = rgba[3][j]; + } + } + + + static void + exec_tex(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst, + boolean biasLod) + { + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + union tgsi_exec_channel r[8]; + uint chan_index; + float lodBias; + + /* debug_printf("Sampler %u unit %u\n", sampler, unit); */ + + switch (inst->InstructionExtTexture.Texture) { + case TGSI_TEXTURE_1D: + + FETCH(&r[0], 0, CHAN_X); + + switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) { + case TGSI_EXTSWIZZLE_W: + FETCH(&r[1], 0, CHAN_W); + micro_div( &r[0], &r[0], &r[1] ); + break; + + case TGSI_EXTSWIZZLE_ONE: + break; + + default: + assert (0); + } + + if (biasLod) { + FETCH(&r[1], 0, CHAN_W); + lodBias = r[2].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ + &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ + break; + + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) { + case TGSI_EXTSWIZZLE_W: + FETCH(&r[3], 0, CHAN_W); + micro_div( &r[0], &r[0], &r[3] ); + micro_div( &r[1], &r[1], &r[3] ); + micro_div( &r[2], &r[2], &r[3] ); + break; + + case TGSI_EXTSWIZZLE_ONE: + break; + + default: + assert (0); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, /* inputs */ + &r[0], &r[1], &r[2], &r[3]); /* outputs */ + break; + + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) { + case TGSI_EXTSWIZZLE_W: + FETCH(&r[3], 0, CHAN_W); + micro_div( &r[0], &r[0], &r[3] ); + micro_div( &r[1], &r[1], &r[3] ); + micro_div( &r[2], &r[2], &r[3] ); + break; + + case TGSI_EXTSWIZZLE_ONE: + break; + + default: + assert (0); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, + &r[0], &r[1], &r[2], &r[3]); + break; + + default: + assert (0); + } + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[chan_index], 0, chan_index ); + } + } + + + /** + * Evaluate a constant-valued coefficient at the position of the + * current quad. + */ + static void + eval_constant_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) + { + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan]; + } + } + + /** + * Evaluate a linear-valued coefficient at the position of the + * current quad. + */ + static void + eval_linear_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) + { + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + mach->Inputs[attrib].xyzw[chan].f[0] = a0; + mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; + mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; + mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; + } + + /** + * Evaluate a perspective-valued coefficient at the position of the + * current quad. + */ + static void + eval_perspective_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) + { + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + const float *w = mach->QuadPos.xyzw[3].f; + /* divide by W here */ + mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; + mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; + mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; + mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; + } + + + typedef void (* eval_coef_func)( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ); + + static void + exec_declaration( + struct tgsi_exec_machine *mach, + const struct tgsi_full_declaration *decl ) + { + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + eval_coef_func eval; + + assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE ); + + first = decl->u.DeclarationRange.First; + last = decl->u.DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + switch( decl->Interpolation.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + eval = eval_constant_coef; + break; + + case TGSI_INTERPOLATE_LINEAR: + eval = eval_linear_coef; + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + eval = eval_perspective_coef; + break; + + default: + assert( 0 ); + } + + if( mask == TGSI_WRITEMASK_XYZW ) { + unsigned i, j; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + eval( mach, i, j ); + } + } + } + else { + unsigned i, j; + + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + for( i = first; i <= last; i++ ) { + eval( mach, i, j ); + } + } + } + } + } + } + } + + static void + exec_instruction( + struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst, + int *pc ) + { + uint chan_index; + union tgsi_exec_channel r[8]; + + (*pc)++; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_f2it( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + /* TGSI_OPCODE_SWZ */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_X ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[1], 0, CHAN_Y ); + micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + + FETCH( &r[2], 0, CHAN_W ); + micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] ); + micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] ); + micro_pow( &r[1], &r[1], &r[2] ); + micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Z ); + } + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( &r[0], 0, CHAN_X ); + micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( &r[0], 0, CHAN_X ); + micro_sqrt( &r[0], &r[0] ); + micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + assert (0); + break; + + case TGSI_OPCODE_LOG: + assert (0); + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) + { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + micro_mul( &r[0], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_add( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Z ); + FETCH( &r[2], 1, CHAN_Z ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_W); + FETCH(&r[2], 1, CHAN_W); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + FETCH( &r[0], 0, CHAN_Y ); + FETCH( &r[1], 1, CHAN_Y); + micro_mul( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_Z ); + STORE( &r[0], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + FETCH( &r[0], 1, CHAN_W ); + STORE( &r[0], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + /* XXX use micro_min()?? */ + micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + /* XXX use micro_max()?? */ + micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] ); + + STORE(&r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_mul( &r[0], &r[0], &r[1] ); + FETCH( &r[1], 2, chan_index ); + micro_add( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + micro_sub( &r[0], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + micro_sub( &r[1], &r[1], &r[2] ); + micro_mul( &r[0], &r[0], &r[1] ); + micro_add( &r[0], &r[0], &r[2] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_CND: + assert (0); + break; + + case TGSI_OPCODE_CND0: + assert (0); + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + assert (0); + break; + + case TGSI_OPCODE_INDEX: + assert (0); + break; + + case TGSI_OPCODE_NEGATE: + assert (0); + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_frc( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + assert (0); + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_flr( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_rnd( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH(&r[0], 0, CHAN_X); + + micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( &r[0], 0, CHAN_X ); + micro_lg2( &r[0], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_pow( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + FETCH(&r[0], 0, CHAN_Y); + FETCH(&r[1], 1, CHAN_Z); + + micro_mul( &r[2], &r[0], &r[1] ); + + FETCH(&r[3], 0, CHAN_Z); + FETCH(&r[4], 1, CHAN_Y); + + micro_mul( &r[5], &r[3], &r[4] ); + micro_sub( &r[2], &r[2], &r[5] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &r[2], 0, CHAN_X ); + } + + FETCH(&r[2], 1, CHAN_X); + + micro_mul( &r[3], &r[3], &r[2] ); + + FETCH(&r[5], 0, CHAN_X); + + micro_mul( &r[1], &r[1], &r[5] ); + micro_sub( &r[3], &r[3], &r[1] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + STORE( &r[3], 0, CHAN_Y ); + } + + micro_mul( &r[5], &r[5], &r[4] ); + micro_mul( &r[0], &r[0], &r[2] ); + micro_sub( &r[5], &r[5], &r[0] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( &r[5], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + assert (0); + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + + micro_abs( &r[0], &r[0] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_RCC: + assert (0); + break; + + case TGSI_OPCODE_DPH: + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 1, CHAN_W); + + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH(&r[0], 0, CHAN_X); + + micro_cos( &r[0], &r[0] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ddx( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDY: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ddy( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_KILP: + exec_kilp (mach, inst); + break; + + case TGSI_OPCODE_KIL: + /* for enabled ExecMask bits, set the killed bit */ + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask; + break; + + case TGSI_OPCODE_PK2H: + assert (0); + break; + + case TGSI_OPCODE_PK2US: + assert (0); + break; + + case TGSI_OPCODE_PK4B: + assert (0); + break; + + case TGSI_OPCODE_PK4UB: + assert (0); + break; + + case TGSI_OPCODE_RFL: + assert (0); + break; + + case TGSI_OPCODE_SEQ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_eq( &r[0], &r[0], &r[1], + &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], + &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SFL: + assert (0); + break; + + case TGSI_OPCODE_SGT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SIN: + FETCH( &r[0], 0, CHAN_X ); + micro_sin( &r[0], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SNE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_STR: + assert (0); + break; + + case TGSI_OPCODE_TEX: + /* simple texture lookup */ + /* src[0] = texcoord */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, FALSE); + break; + + case TGSI_OPCODE_TXB: + /* Texture lookup with lod bias */ + /* src[0] = texcoord (src[0].w = LOD bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE); + break; + + case TGSI_OPCODE_TXD: + /* Texture lookup with explict partial derivatives */ + /* src[0] = texcoord */ + /* src[1] = d[strq]/dx */ + /* src[2] = d[strq]/dy */ + /* src[3] = sampler unit */ + assert (0); + break; + + case TGSI_OPCODE_TXL: + /* Texture lookup with explit LOD */ + /* src[0] = texcoord (src[0].w = LOD) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE); + break; + + case TGSI_OPCODE_UP2H: + assert (0); + break; + + case TGSI_OPCODE_UP2US: + assert (0); + break; + + case TGSI_OPCODE_UP4B: + assert (0); + break; + + case TGSI_OPCODE_UP4UB: + assert (0); + break; + + case TGSI_OPCODE_X2D: + assert (0); + break; + + case TGSI_OPCODE_ARA: + assert (0); + break; + + case TGSI_OPCODE_ARR: + assert (0); + break; + + case TGSI_OPCODE_BRA: + assert (0); + break; + + case TGSI_OPCODE_CAL: + /* skip the call if no execution channels are enabled */ + if (mach->ExecMask) { + /* do the call */ + + /* push the Cond, Loop, Cont stacks */ + assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + + assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING); + mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask; + + /* note that PC was already incremented above */ + mach->CallStack[mach->CallStackTop++] = *pc; + *pc = inst->InstructionExtLabel.Label; + } + break; + + case TGSI_OPCODE_RET: + mach->FuncMask &= ~mach->ExecMask; + UPDATE_EXEC_MASK(mach); + + if (mach->ExecMask == 0x0) { + /* really return now (otherwise, keep executing */ + + if (mach->CallStackTop == 0) { + /* returning from main() */ + *pc = -1; + return; + } + *pc = mach->CallStack[--mach->CallStackTop]; + + /* pop the Cond, Loop, Cont stacks */ + assert(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + assert(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + assert(mach->FuncStackTop > 0); + mach->FuncMask = mach->FuncStack[--mach->FuncStackTop]; + + UPDATE_EXEC_MASK(mach); + } + break; + + case TGSI_OPCODE_SSG: + assert (0); + break; + + case TGSI_OPCODE_CMP: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_SCS: + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( &r[0], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + micro_cos( &r[1], &r[0] ); + STORE( &r[1], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + micro_sin( &r[1], &r[0] ); + STORE( &r[1], 0, CHAN_Y ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_NRM: + assert (0); + break; + + case TGSI_OPCODE_DIV: + assert( 0 ); + break; + + case TGSI_OPCODE_DP2: + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_IF: + /* push CondMask */ + assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + FETCH( &r[0], 0, CHAN_X ); + /* update CondMask */ + if( ! r[0].u[0] ) { + mach->CondMask &= ~0x1; + } + if( ! r[0].u[1] ) { + mach->CondMask &= ~0x2; + } + if( ! r[0].u[2] ) { + mach->CondMask &= ~0x4; + } + if( ! r[0].u[3] ) { + mach->CondMask &= ~0x8; + } + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ELSE */ + break; + + case TGSI_OPCODE_ELSE: + /* invert CondMask wrt previous mask */ + { + uint prevMask; + assert(mach->CondStackTop > 0); + prevMask = mach->CondStack[mach->CondStackTop - 1]; + mach->CondMask = ~mach->CondMask & prevMask; + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ENDIF */ + } + break; + + case TGSI_OPCODE_ENDIF: + /* pop CondMask */ + assert(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_END: + /* halt execution */ + *pc = -1; + break; + + case TGSI_OPCODE_REP: + assert (0); + break; + + case TGSI_OPCODE_ENDREP: + assert (0); + break; + + case TGSI_OPCODE_PUSHA: + assert (0); + break; + + case TGSI_OPCODE_POPA: + assert (0); + break; + + case TGSI_OPCODE_CEIL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ceil( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_I2F: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_i2f( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_NOT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_not( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_TRUNC: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_trunc( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_shl( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ishr( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_AND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_and( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_OR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_or( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOD: + assert (0); + break; + + case TGSI_OPCODE_XOR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_xor( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SAD: + assert (0); + break; + + case TGSI_OPCODE_TXF: + assert (0); + break; + + case TGSI_OPCODE_TXQ: + assert (0); + break; + + case TGSI_OPCODE_EMIT: + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; + break; + + case TGSI_OPCODE_ENDPRIM: + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; + break; + + case TGSI_OPCODE_LOOP: + /* fall-through (for now) */ + case TGSI_OPCODE_BGNLOOP2: + /* push LoopMask and ContMasks */ + assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + break; + + case TGSI_OPCODE_ENDLOOP: + /* fall-through (for now at least) */ + case TGSI_OPCODE_ENDLOOP2: + /* Restore ContMask, but don't pop */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; + if (mach->LoopMask) { + /* repeat loop: jump to instruction just past BGNLOOP */ + *pc = inst->InstructionExtLabel.Label + 1; + } + else { + /* exit loop: pop LoopMask */ + assert(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + /* pop ContMask */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + } + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BRK: + /* turn off loop channels for each enabled exec channel */ + mach->LoopMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_CONT: + /* turn off cont channels for each enabled exec channel */ + mach->ContMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BGNSUB: + /* no-op */ + break; + + case TGSI_OPCODE_ENDSUB: + /* no-op */ + break; + + case TGSI_OPCODE_NOISE1: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE2: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE3: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE4: + assert( 0 ); + break; + + case TGSI_OPCODE_NOP: + break; + + default: + assert( 0 ); + } + } + + + /** + * Run TGSI interpreter. + * \return bitmask of "alive" quad components + */ + uint + tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) + { + uint i; + int pc = 0; + + mach->CondMask = 0xf; + mach->LoopMask = 0xf; + mach->ContMask = 0xf; + mach->FuncMask = 0xf; + mach->ExecMask = 0xf; + + mach->CondStackTop = 0; /* temporarily subvert this assertion */ + assert(mach->CondStackTop == 0); + assert(mach->LoopStackTop == 0); + assert(mach->ContStackTop == 0); + assert(mach->CallStackTop == 0); + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; + + if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; + mach->Primitives[0] = 0; + } + + + /* execute declarations (interpolants) */ + for (i = 0; i < mach->NumDeclarations; i++) { + exec_declaration( mach, mach->Declarations+i ); + } + + /* execute instructions, until pc is set to -1 */ + while (pc != -1) { + assert(pc < mach->NumInstructions); + exec_instruction( mach, mach->Instructions + pc, &pc ); + } + + #if 0 + /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ + if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { + /* + * Scale back depth component. + */ + for (i = 0; i < 4; i++) + mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; + } + #endif + + return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + } + + diff --cc src/gallium/aux/tgsi/exec/tgsi_exec.h index 00000000000,1fb66ee960f..45c49dd007c mode 000000,100644..100644 --- a/src/gallium/aux/tgsi/exec/tgsi_exec.h +++ b/src/gallium/aux/tgsi/exec/tgsi_exec.h @@@ -1,0 -1,239 +1,243 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + #if !defined TGSI_EXEC_H + #define TGSI_EXEC_H + + #include "pipe/p_compiler.h" + + #if defined __cplusplus + extern "C" { + #endif + + #define NUM_CHANNELS 4 /* R,G,B,A */ + #define QUAD_SIZE 4 /* 4 pixel/quad */ + + /** + * Registers may be treated as float, signed int or unsigned int. + */ + union tgsi_exec_channel + { + float f[QUAD_SIZE]; + int i[QUAD_SIZE]; + unsigned u[QUAD_SIZE]; + }; + + /** + * A vector[RGBA] of channels[4 pixels] + */ + struct tgsi_exec_vector + { + union tgsi_exec_channel xyzw[NUM_CHANNELS]; + }; + + /** + * For fragment programs, information for computing fragment input + * values from plane equation of the triangle/line. + */ + struct tgsi_interp_coef + { + float a0[NUM_CHANNELS]; /* in an xyzw layout */ + float dadx[NUM_CHANNELS]; + float dady[NUM_CHANNELS]; + }; + + + struct softpipe_tile_cache; /**< Opaque to TGSI */ + + /** + * Information for sampling textures, which must be implemented + * by code outside the TGSI executor. + */ + struct tgsi_sampler + { + const struct pipe_sampler_state *state; + struct pipe_texture *texture; + /** Get samples for four fragments in a quad */ + void (*get_samples)(struct tgsi_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); + void *pipe; /*XXX temporary*/ + struct softpipe_tile_cache *cache; + }; + + /** + * For branching/calling subroutines. + */ + struct tgsi_exec_labels + { + unsigned labels[128][2]; + unsigned count; + }; + + /* + * Locations of various utility registers (_I = Index, _C = Channel) + */ + #define TGSI_EXEC_TEMP_00000000_I 32 + #define TGSI_EXEC_TEMP_00000000_C 0 + + #define TGSI_EXEC_TEMP_7FFFFFFF_I 32 + #define TGSI_EXEC_TEMP_7FFFFFFF_C 1 + + #define TGSI_EXEC_TEMP_80000000_I 32 + #define TGSI_EXEC_TEMP_80000000_C 2 + + #define TGSI_EXEC_TEMP_FFFFFFFF_I 32 + #define TGSI_EXEC_TEMP_FFFFFFFF_C 3 + + #define TGSI_EXEC_TEMP_ONE_I 33 + #define TGSI_EXEC_TEMP_ONE_C 0 + + #define TGSI_EXEC_TEMP_TWO_I 33 + #define TGSI_EXEC_TEMP_TWO_C 1 + + #define TGSI_EXEC_TEMP_128_I 33 + #define TGSI_EXEC_TEMP_128_C 2 + + #define TGSI_EXEC_TEMP_MINUS_128_I 33 + #define TGSI_EXEC_TEMP_MINUS_128_C 3 + + #define TGSI_EXEC_TEMP_KILMASK_I 34 + #define TGSI_EXEC_TEMP_KILMASK_C 0 + + #define TGSI_EXEC_TEMP_OUTPUT_I 34 + #define TGSI_EXEC_TEMP_OUTPUT_C 1 + + #define TGSI_EXEC_TEMP_PRIMITIVE_I 34 + #define TGSI_EXEC_TEMP_PRIMITIVE_C 2 + + #define TGSI_EXEC_TEMP_R0 35 + + #define TGSI_EXEC_NUM_TEMPS (32 + 4) + #define TGSI_EXEC_NUM_ADDRS 1 + #define TGSI_EXEC_NUM_IMMEDIATES 256 + + #define TGSI_EXEC_MAX_COND_NESTING 10 + #define TGSI_EXEC_MAX_LOOP_NESTING 10 + #define TGSI_EXEC_MAX_CALL_NESTING 10 + + /** + * Run-time virtual machine state for executing TGSI shader. + */ + struct tgsi_exec_machine + { + /* + * 32 program temporaries + * 4 internal temporaries + * 1 address + * 1 temporary of padding to align to 16 bytes + */ + struct tgsi_exec_vector _Temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_ADDRS + 1]; + + /* + * This will point to _Temps after aligning to 16B boundary. + */ + struct tgsi_exec_vector *Temps; + struct tgsi_exec_vector *Addrs; + + struct tgsi_sampler *Samplers; + + float Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; + unsigned ImmLimit; + float (*Consts)[4]; + struct tgsi_exec_vector *Inputs; + struct tgsi_exec_vector *Outputs; + const struct tgsi_token *Tokens; + unsigned Processor; + + /* GEOMETRY processor only. */ + unsigned *Primitives; + + /* FRAGMENT processor only. */ + const struct tgsi_interp_coef *InterpCoefs; + struct tgsi_exec_vector QuadPos; + + /* Conditional execution masks */ + uint CondMask; /**< For IF/ELSE/ENDIF */ + uint LoopMask; /**< For BGNLOOP/ENDLOOP */ + uint ContMask; /**< For loop CONT statements */ + uint FuncMask; /**< For function calls */ + uint ExecMask; /**< = CondMask & LoopMask */ + + /** Condition mask stack (for nested conditionals) */ + uint CondStack[TGSI_EXEC_MAX_COND_NESTING]; + int CondStackTop; + + /** Loop mask stack (for nested loops) */ + uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int LoopStackTop; + + /** Loop continue mask stack (see comments in tgsi_exec.c) */ + uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int ContStackTop; + + /** Function execution mask stack (for executing subroutine code) */ + uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING]; + int FuncStackTop; + + /** Function call stack for saving/restoring the program counter */ + uint CallStack[TGSI_EXEC_MAX_CALL_NESTING]; + int CallStackTop; + + struct tgsi_full_instruction *Instructions; + uint NumInstructions; + + struct tgsi_full_declaration *Declarations; + uint NumDeclarations; + + struct tgsi_exec_labels Labels; + }; + - + void + tgsi_exec_machine_init( ++ struct tgsi_exec_machine *mach ); ++ ++ ++void ++tgsi_exec_machine_bind_shader( + struct tgsi_exec_machine *mach, + const struct tgsi_token *tokens, - unsigned numSamplers, ++ uint numSamplers, + struct tgsi_sampler *samplers); + + uint + tgsi_exec_machine_run( + struct tgsi_exec_machine *mach ); + + + void + tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach); + + + #if defined __cplusplus + } /* extern "C" */ + #endif + + #endif /* TGSI_EXEC_H */ diff --cc src/gallium/aux/tgsi/exec/tgsi_sse2.c index 00000000000,593464db3ed..79209575bc4 mode 000000,100755..100755 --- a/src/gallium/aux/tgsi/exec/tgsi_sse2.c +++ b/src/gallium/aux/tgsi/exec/tgsi_sse2.c @@@ -1,0 -1,2378 +1,2377 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + #include "pipe/p_util.h" + #include "pipe/p_shader_tokens.h" + #include "tgsi/util/tgsi_parse.h" + #include "tgsi/util/tgsi_util.h" + #include "tgsi_exec.h" + #include "tgsi_sse2.h" + + #include "x86/rtasm/x86sse.h" + + #if defined(__i386__) || defined(__386__) + + #define DUMP_SSE 0 + + #if DUMP_SSE + + static void + _print_reg( + struct x86_reg reg ) + { + if (reg.mod != mod_REG) + debug_printf( "[" ); + + switch( reg.file ) { + case file_REG32: + switch( reg.idx ) { + case reg_AX: + debug_printf( "EAX" ); + break; + case reg_CX: + debug_printf( "ECX" ); + break; + case reg_DX: + debug_printf( "EDX" ); + break; + case reg_BX: + debug_printf( "EBX" ); + break; + case reg_SP: + debug_printf( "ESP" ); + break; + case reg_BP: + debug_printf( "EBP" ); + break; + case reg_SI: + debug_printf( "ESI" ); + break; + case reg_DI: + debug_printf( "EDI" ); + break; + } + break; + case file_MMX: + assert( 0 ); + break; + case file_XMM: + debug_printf( "XMM%u", reg.idx ); + break; + case file_x87: + assert( 0 ); + break; + } + + if (reg.mod == mod_DISP8 || + reg.mod == mod_DISP32) + debug_printf("+%d", reg.disp); + + if (reg.mod != mod_REG) + debug_printf( "]" ); + } + + static void + _fill( + const char *op ) + { + unsigned count = 10 - strlen( op ); + + while( count-- ) { + debug_printf( " " ); + } + } + + #define DUMP_START() debug_printf( "\nsse-dump start ----------------" ) + #define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" ) + #define DUMP( OP ) debug_printf( "\n%s", OP ) + #define DUMP_I( OP, I ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + debug_printf( "%u", I ); } while( 0 ) + #define DUMP_R( OP, R0 ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + _print_reg( R0 ); } while( 0 ) + #define DUMP_RR( OP, R0, R1 ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + _print_reg( R0 );\ + debug_printf( ", " );\ + _print_reg( R1 ); } while( 0 ) + #define DUMP_RRI( OP, R0, R1, I ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + _print_reg( R0 );\ + debug_printf( ", " );\ + _print_reg( R1 );\ + debug_printf( ", " );\ + debug_printf( "%u", I ); } while( 0 ) + + #else + + #define DUMP_START() + #define DUMP_END() + #define DUMP( OP ) + #define DUMP_I( OP, I ) + #define DUMP_R( OP, R0 ) + #define DUMP_RR( OP, R0, R1 ) + #define DUMP_RRI( OP, R0, R1, I ) + + #endif + + #define FOR_EACH_CHANNEL( CHAN )\ + for( CHAN = 0; CHAN < 4; CHAN++ ) + + #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + + #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + if( IS_DST0_CHANNEL_ENABLED( INST, CHAN )) + + #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\ + FOR_EACH_CHANNEL( CHAN )\ + IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) + + #define CHAN_X 0 + #define CHAN_Y 1 + #define CHAN_Z 2 + #define CHAN_W 3 + + #define TEMP_R0 TGSI_EXEC_TEMP_R0 + + /** + * X86 utility functions. + */ + + static struct x86_reg + make_xmm( + unsigned xmm ) + { + return x86_make_reg( + file_XMM, + (enum x86_reg_name) xmm ); + } + + /** + * X86 register mapping helpers. + */ + + static struct x86_reg + get_const_base( void ) + { + return x86_make_reg( + file_REG32, + reg_CX ); + } + + static struct x86_reg + get_input_base( void ) + { + return x86_make_reg( + file_REG32, + reg_AX ); + } + + static struct x86_reg + get_output_base( void ) + { + return x86_make_reg( + file_REG32, + reg_DX ); + } + + static struct x86_reg + get_temp_base( void ) + { + #ifdef WIN32 + return x86_make_reg( + file_REG32, + reg_BX ); + #else + return x86_make_reg( + file_REG32, + reg_SI ); + #endif + } + + static struct x86_reg + get_coef_base( void ) + { + return get_output_base(); + } + + /** + * Data access helpers. + */ + + static struct x86_reg + get_argument( + unsigned index ) + { + return x86_make_disp( + x86_make_reg( file_REG32, reg_SP ), + (index + 1) * 4 ); + } + + static struct x86_reg + get_const( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_const_base(), + (vec * 4 + chan) * 4 ); + } + + static struct x86_reg + get_input( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_input_base(), + (vec * 4 + chan) * 16 ); + } + + static struct x86_reg + get_output( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_output_base(), + (vec * 4 + chan) * 16 ); + } + + static struct x86_reg + get_temp( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_temp_base(), + (vec * 4 + chan) * 16 ); + } + + static struct x86_reg + get_coef( + unsigned vec, + unsigned chan, + unsigned member ) + { + return x86_make_disp( + get_coef_base(), + ((vec * 3 + member) * 4 + chan) * 4 ); + } + + /** + * X86 rtasm wrappers. + */ + + static void + emit_addps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ADDPS", dst, src ); + sse_addps( func, dst, src ); + } + + static void + emit_andnps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ANDNPS", dst, src ); + sse_andnps( func, dst, src ); + } + + static void + emit_andps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ANDPS", dst, src ); + sse_andps( func, dst, src ); + } + + static void + emit_call( + struct x86_function *func, + void (* addr)() ) + { + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + + DUMP_I( "CALL", addr ); + x86_mov_reg_imm( func, ecx, (unsigned long) addr ); + x86_call( func, ecx ); + } + + static void + emit_cmpps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src, + enum sse_cc cc ) + { + DUMP_RRI( "CMPPS", dst, src, cc ); + sse_cmpps( func, dst, src, cc ); + } + + static void + emit_cvttps2dq( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "CVTTPS2DQ", dst, src ); + sse2_cvttps2dq( func, dst, src ); + } + + static void + emit_maxps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MAXPS", dst, src ); + sse_maxps( func, dst, src ); + } + + static void + emit_minps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MINPS", dst, src ); + sse_minps( func, dst, src ); + } + + static void + emit_mov( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOV", dst, src ); + x86_mov( func, dst, src ); + } + + static void + emit_movaps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOVAPS", dst, src ); + sse_movaps( func, dst, src ); + } + + static void + emit_movss( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOVSS", dst, src ); + sse_movss( func, dst, src ); + } + + static void + emit_movups( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOVUPS", dst, src ); + sse_movups( func, dst, src ); + } + + static void + emit_mulps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MULPS", dst, src ); + sse_mulps( func, dst, src ); + } + + static void + emit_or( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "OR", dst, src ); + x86_or( func, dst, src ); + } + + static void + emit_orps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ORPS", dst, src ); + sse_orps( func, dst, src ); + } + + static void + emit_pmovmskb( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "PMOVMSKB", dst, src ); + sse_pmovmskb( func, dst, src ); + } + + static void + emit_pop( + struct x86_function *func, + struct x86_reg dst ) + { + DUMP_R( "POP", dst ); + x86_pop( func, dst ); + } + + static void + emit_push( + struct x86_function *func, + struct x86_reg dst ) + { + DUMP_R( "PUSH", dst ); + x86_push( func, dst ); + } + + static void + emit_rcpps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "RCPPS", dst, src ); + sse2_rcpps( func, dst, src ); + } + + #ifdef WIN32 + static void + emit_retw( + struct x86_function *func, + unsigned size ) + { + DUMP_I( "RET", size ); + x86_retw( func, size ); + } + #else + static void + emit_ret( + struct x86_function *func ) + { + DUMP( "RET" ); + x86_ret( func ); + } + #endif + + static void + emit_rsqrtps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "RSQRTPS", dst, src ); + sse_rsqrtps( func, dst, src ); + } + + static void + emit_shufps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf ) + { + DUMP_RRI( "SHUFPS", dst, src, shuf ); + sse_shufps( func, dst, src, shuf ); + } + + static void + emit_subps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "SUBPS", dst, src ); + sse_subps( func, dst, src ); + } + + static void + emit_xorps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "XORPS", dst, src ); + sse_xorps( func, dst, src ); + } + + /** + * Data fetch helpers. + */ + + static void + emit_const( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_movss( + func, + make_xmm( xmm ), + get_const( vec, chan ) ); + emit_shufps( + func, + make_xmm( xmm ), + make_xmm( xmm ), + SHUF( 0, 0, 0, 0 ) ); + } + + static void + emit_inputf( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_movups( + func, + make_xmm( xmm ), + get_input( vec, chan ) ); + } + + static void + emit_output( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_movups( + func, + get_output( vec, chan ), + make_xmm( xmm ) ); + } + + static void + emit_tempf( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_movaps( + func, + make_xmm( xmm ), + get_temp( vec, chan ) ); + } + + static void + emit_coef( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan, + unsigned member ) + { + emit_movss( + func, + make_xmm( xmm ), + get_coef( vec, chan, member ) ); + emit_shufps( + func, + make_xmm( xmm ), + make_xmm( xmm ), + SHUF( 0, 0, 0, 0 ) ); + } + + /** + * Data store helpers. + */ + + static void + emit_inputs( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_movups( + func, + get_input( vec, chan ), + make_xmm( xmm ) ); + } + + static void + emit_temps( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_movaps( + func, + get_temp( vec, chan ), + make_xmm( xmm ) ); + } + + static void + emit_addrs( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_temps( + func, + xmm, + vec + TGSI_EXEC_NUM_TEMPS, + chan ); + } + + /** + * Coefficent fetch helpers. + */ + + static void + emit_coef_a0( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_coef( + func, + xmm, + vec, + chan, + 0 ); + } + + static void + emit_coef_dadx( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_coef( + func, + xmm, + vec, + chan, + 1 ); + } + + static void + emit_coef_dady( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) + { + emit_coef( + func, + xmm, + vec, + chan, + 2 ); + } + + /** + * Function call helpers. + */ + + static void + emit_push_gp( + struct x86_function *func ) + { + emit_push( + func, + get_const_base() ); + emit_push( + func, + get_input_base() ); + emit_push( + func, + get_output_base() ); + + /* It is important on non-win32 platforms that temp base is pushed last. + */ + emit_push( + func, + get_temp_base() ); + } + + static void + emit_pop_gp( + struct x86_function *func ) + { + /* Restore GP registers in a reverse order. + */ + emit_pop( + func, + get_temp_base() ); + emit_pop( + func, + get_output_base() ); + emit_pop( + func, + get_input_base() ); + emit_pop( + func, + get_const_base() ); + } + + static void + emit_func_call_dst( + struct x86_function *func, + unsigned xmm_dst, + void (*code)() ) + { + emit_movaps( + func, + get_temp( TEMP_R0, 0 ), + make_xmm( xmm_dst ) ); + + emit_push_gp( + func ); + + #ifdef WIN32 + emit_push( + func, + get_temp( TEMP_R0, 0 ) ); + #endif + + emit_call( + func, + code ); + + emit_pop_gp( + func ); + + emit_movaps( + func, + make_xmm( xmm_dst ), + get_temp( TEMP_R0, 0 ) ); + } + + static void + emit_func_call_dst_src( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src, + void (*code)() ) + { + emit_movaps( + func, + get_temp( TEMP_R0, 1 ), + make_xmm( xmm_src ) ); + + emit_func_call_dst( + func, + xmm_dst, + code ); + } + + /** + * Low-level instruction translators. + */ + + static void + emit_abs( + struct x86_function *func, + unsigned xmm ) + { + emit_andps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_7FFFFFFF_I, + TGSI_EXEC_TEMP_7FFFFFFF_C ) ); + } + + static void + emit_add( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_addps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void XSTDCALL + cos4f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) cos( (double) store[0] ); + store[1] = (float) cos( (double) store[1] ); + store[2] = (float) cos( (double) store[2] ); + store[3] = (float) cos( (double) store[3] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = cosf( store[X + 0] ); + store[X + 1] = cosf( store[X + 1] ); + store[X + 2] = cosf( store[X + 2] ); + store[X + 3] = cosf( store[X + 3] ); + #endif + } + + static void + emit_cos( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + cos4f ); + } + + static void XSTDCALL + ex24f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) pow( 2.0, (double) store[0] ); + store[1] = (float) pow( 2.0, (double) store[1] ); + store[2] = (float) pow( 2.0, (double) store[2] ); + store[3] = (float) pow( 2.0, (double) store[3] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = powf( 2.0f, store[X + 0] ); + store[X + 1] = powf( 2.0f, store[X + 1] ); + store[X + 2] = powf( 2.0f, store[X + 2] ); + store[X + 3] = powf( 2.0f, store[X + 3] ); + #endif + } + + static void + emit_ex2( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + ex24f ); + } + + static void + emit_f2it( + struct x86_function *func, + unsigned xmm ) + { + emit_cvttps2dq( + func, + make_xmm( xmm ), + make_xmm( xmm ) ); + } + + static void XSTDCALL + flr4f( + float *store ) + { + #ifdef WIN32 + const unsigned X = 0; + #else + const unsigned X = TEMP_R0 * 16; + #endif + store[X + 0] = (float) floor( (double) store[X + 0] ); + store[X + 1] = (float) floor( (double) store[X + 1] ); + store[X + 2] = (float) floor( (double) store[X + 2] ); + store[X + 3] = (float) floor( (double) store[X + 3] ); + } + + static void + emit_flr( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + flr4f ); + } + + static void XSTDCALL + frc4f( + float *store ) + { + #ifdef WIN32 + const unsigned X = 0; + #else + const unsigned X = TEMP_R0 * 16; + #endif + store[X + 0] -= (float) floor( (double) store[X + 0] ); + store[X + 1] -= (float) floor( (double) store[X + 1] ); + store[X + 2] -= (float) floor( (double) store[X + 2] ); + store[X + 3] -= (float) floor( (double) store[X + 3] ); + } + + static void + emit_frc( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + frc4f ); + } + + static void XSTDCALL + lg24f( + float *store ) + { + #ifdef WIN32 + const unsigned X = 0; + #else + const unsigned X = TEMP_R0 * 16; + #endif + store[X + 0] = LOG2( store[X + 0] ); + store[X + 1] = LOG2( store[X + 1] ); + store[X + 2] = LOG2( store[X + 2] ); + store[X + 3] = LOG2( store[X + 3] ); + } + + static void + emit_lg2( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + lg24f ); + } + + static void + emit_MOV( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_movups( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_mul (struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src) + { + emit_mulps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_neg( + struct x86_function *func, + unsigned xmm ) + { + emit_xorps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_80000000_I, + TGSI_EXEC_TEMP_80000000_C ) ); + } + + static void XSTDCALL + pow4f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) pow( (double) store[0], (double) store[4] ); + store[1] = (float) pow( (double) store[1], (double) store[5] ); + store[2] = (float) pow( (double) store[2], (double) store[6] ); + store[3] = (float) pow( (double) store[3], (double) store[7] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = powf( store[X + 0], store[X + 4] ); + store[X + 1] = powf( store[X + 1], store[X + 5] ); + store[X + 2] = powf( store[X + 2], store[X + 6] ); + store[X + 3] = powf( store[X + 3], store[X + 7] ); + #endif + } + + static void + emit_pow( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_func_call_dst_src( + func, + xmm_dst, + xmm_src, + pow4f ); + } + + static void + emit_rcp ( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_rcpps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_rsqrt( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_rsqrtps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_setsign( + struct x86_function *func, + unsigned xmm ) + { + emit_orps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_80000000_I, + TGSI_EXEC_TEMP_80000000_C ) ); + } + + static void XSTDCALL + sin4f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) sin( (double) store[0] ); + store[1] = (float) sin( (double) store[1] ); + store[2] = (float) sin( (double) store[2] ); + store[3] = (float) sin( (double) store[3] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = sinf( store[X + 0] ); + store[X + 1] = sinf( store[X + 1] ); + store[X + 2] = sinf( store[X + 2] ); + store[X + 3] = sinf( store[X + 3] ); + #endif + } + + static void + emit_sin (struct x86_function *func, + unsigned xmm_dst) + { + emit_func_call_dst( + func, + xmm_dst, + sin4f ); + } + + static void + emit_sub( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_subps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + /** + * Register fetch. + */ + + static void + emit_fetch( + struct x86_function *func, + unsigned xmm, + const struct tgsi_full_src_register *reg, + const unsigned chan_index ) + { + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + + switch( swizzle ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch( reg->SrcRegister.File ) { + case TGSI_FILE_CONSTANT: + emit_const( + func, + xmm, + reg->SrcRegister.Index, + swizzle ); + break; + + case TGSI_FILE_INPUT: + emit_inputf( + func, + xmm, + reg->SrcRegister.Index, + swizzle ); + break; + + case TGSI_FILE_TEMPORARY: + emit_tempf( + func, + xmm, + reg->SrcRegister.Index, + swizzle ); + break; + + default: + assert( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + emit_tempf( + func, + xmm, + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ); + break; + + case TGSI_EXTSWIZZLE_ONE: + emit_tempf( + func, + xmm, + TGSI_EXEC_TEMP_ONE_I, + TGSI_EXEC_TEMP_ONE_C ); + break; + + default: + assert( 0 ); + } + + switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) { + case TGSI_UTIL_SIGN_CLEAR: + emit_abs( func, xmm ); + break; + + case TGSI_UTIL_SIGN_SET: + emit_setsign( func, xmm ); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + emit_neg( func, xmm ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } + } + + #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\ + emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN ) + + /** + * Register store. + */ + + static void + emit_store( + struct x86_function *func, + unsigned xmm, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + unsigned chan_index ) + { + switch( reg->DstRegister.File ) { + case TGSI_FILE_OUTPUT: + emit_output( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; + + case TGSI_FILE_TEMPORARY: + emit_temps( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; + + case TGSI_FILE_ADDRESS: + emit_addrs( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; + + default: + assert( 0 ); + } + + switch( inst->Instruction.Saturate ) { + case TGSI_SAT_NONE: + break; + + case TGSI_SAT_ZERO_ONE: + // assert( 0 ); + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + assert( 0 ); + break; + } + } + + #define STORE( FUNC, INST, XMM, INDEX, CHAN )\ + emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN ) + + /** + * High-level instruction translators. + */ + + static void + emit_kil( + struct x86_function *func, + const struct tgsi_full_src_register *reg ) + { + unsigned uniquemask; + unsigned registers[4]; + unsigned nextregister = 0; + unsigned firstchan = ~0; + unsigned chan_index; + + /* This mask stores component bits that were already tested. Note that + * we test if the value is less than zero, so 1.0 and 0.0 need not to be + * tested. */ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + FOR_EACH_CHANNEL( chan_index ) { + unsigned swizzle; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle( + reg, + chan_index ); + + /* check if the component has not been already tested */ + if( !(uniquemask & (1 << swizzle)) ) { + uniquemask |= 1 << swizzle; + + /* allocate register */ + registers[chan_index] = nextregister; + emit_fetch( + func, + nextregister, + reg, + chan_index ); + nextregister++; + + /* mark the first channel used */ + if( firstchan == ~0 ) { + firstchan = chan_index; + } + } + } + + emit_push( + func, + x86_make_reg( file_REG32, reg_AX ) ); + emit_push( + func, + x86_make_reg( file_REG32, reg_DX ) ); + + FOR_EACH_CHANNEL( chan_index ) { + if( uniquemask & (1 << chan_index) ) { + emit_cmpps( + func, + make_xmm( registers[chan_index] ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ), + cc_LessThan ); + + if( chan_index == firstchan ) { + emit_pmovmskb( + func, + x86_make_reg( file_REG32, reg_AX ), + make_xmm( registers[chan_index] ) ); + } + else { + emit_pmovmskb( + func, + x86_make_reg( file_REG32, reg_DX ), + make_xmm( registers[chan_index] ) ); + emit_or( + func, + x86_make_reg( file_REG32, reg_AX ), + x86_make_reg( file_REG32, reg_DX ) ); + } + } + } + + emit_or( + func, + get_temp( + TGSI_EXEC_TEMP_KILMASK_I, + TGSI_EXEC_TEMP_KILMASK_C ), + x86_make_reg( file_REG32, reg_AX ) ); + + emit_pop( + func, + x86_make_reg( file_REG32, reg_DX ) ); + emit_pop( + func, + x86_make_reg( file_REG32, reg_AX ) ); + } + + static void + emit_setcc( + struct x86_function *func, + struct tgsi_full_instruction *inst, + enum sse_cc cc ) + { + unsigned chan_index; + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_cmpps( + func, + make_xmm( 0 ), + make_xmm( 1 ), + cc ); + emit_andps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_ONE_I, + TGSI_EXEC_TEMP_ONE_C ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + } + + static void + emit_cmp( + struct x86_function *func, + struct tgsi_full_instruction *inst ) + { + unsigned chan_index; + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_cmpps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ), + cc_LessThan ); + emit_andps( + func, + make_xmm( 1 ), + make_xmm( 0 ) ); + emit_andnps( + func, + make_xmm( 0 ), + make_xmm( 2 ) ); + emit_orps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + } + + static int + emit_instruction( + struct x86_function *func, + struct tgsi_full_instruction *inst ) + { + unsigned chan_index; + + switch( inst->Instruction.Opcode ) { + case TGSI_OPCODE_ARL: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_f2it( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + /* TGSI_OPCODE_SWZ */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + emit_tempf( + func, + 0, + TGSI_EXEC_TEMP_ONE_I, + TGSI_EXEC_TEMP_ONE_C); + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + STORE( func, *inst, 0, 0, CHAN_X ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( func, *inst, 0, 0, CHAN_W ); + } + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_maxps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ) ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 1, 0, CHAN_Y ); + emit_maxps( + func, + make_xmm( 1 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ) ); + FETCH( func, *inst, 2, 0, CHAN_W ); + emit_minps( + func, + make_xmm( 2 ), + get_temp( + TGSI_EXEC_TEMP_128_I, + TGSI_EXEC_TEMP_128_C ) ); + emit_maxps( + func, + make_xmm( 2 ), + get_temp( + TGSI_EXEC_TEMP_MINUS_128_I, + TGSI_EXEC_TEMP_MINUS_128_C ) ); + emit_pow( func, 1, 2 ); + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_xorps( + func, + make_xmm( 2 ), + make_xmm( 2 ) ); + emit_cmpps( + func, + make_xmm( 2 ), + make_xmm( 0 ), + cc_LessThanEqual ); + emit_andps( + func, + make_xmm( 2 ), + make_xmm( 1 ) ); + STORE( func, *inst, 2, 0, CHAN_Z ); + } + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_rcp( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_rsqrt( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + return 0; + break; + + case TGSI_OPCODE_LOG: + return 0; + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_mul( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_add( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul(func, 1, 2 ); + emit_add(func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_W ); + FETCH( func, *inst, 2, 1, CHAN_W ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + emit_tempf( + func, + 0, + TGSI_EXEC_TEMP_ONE_I, + TGSI_EXEC_TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + FETCH( func, *inst, 0, 0, CHAN_Y ); + FETCH( func, *inst, 1, 1, CHAN_Y ); + emit_mul( func, 0, 1 ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + FETCH( func, *inst, 0, 0, CHAN_Z ); + STORE( func, *inst, 0, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + FETCH( func, *inst, 0, 1, CHAN_W ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_minps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_maxps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + emit_setcc( func, inst, cc_LessThan ); + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + emit_setcc( func, inst, cc_NotLessThan ); + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_mul( func, 0, 1 ); + emit_add( func, 0, 2 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_sub( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_sub( func, 1, 2 ); + emit_mul( func, 0, 1 ); + emit_add( func, 0, 2 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CND: + return 0; + break; + + case TGSI_OPCODE_CND0: + return 0; + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + return 0; + break; + + case TGSI_OPCODE_INDEX: + return 0; + break; + + case TGSI_OPCODE_NEGATE: + return 0; + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_frc( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + return 0; + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_flr( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + return 0; + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_ex2( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_lg2( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_pow( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( func, *inst, 1, 1, CHAN_Z ); + FETCH( func, *inst, 3, 0, CHAN_Z ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 0, 0, CHAN_Y ); + FETCH( func, *inst, 4, 1, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + emit_MOV( func, 2, 0 ); + emit_mul( func, 2, 1 ); + emit_MOV( func, 5, 3 ); + emit_mul( func, 5, 4 ); + emit_sub( func, 2, 5 ); + STORE( func, *inst, 2, 0, CHAN_X ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 2, 1, CHAN_X ); + FETCH( func, *inst, 5, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + emit_mul( func, 3, 2 ); + emit_mul( func, 1, 5 ); + emit_sub( func, 3, 1 ); + STORE( func, *inst, 3, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + emit_mul( func, 5, 4 ); + emit_mul( func, 0, 2 ); + emit_sub( func, 5, 0 ); + STORE( func, *inst, 5, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + return 0; + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_abs( func, 0) ; + + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_RCC: + return 0; + break; + + case TGSI_OPCODE_DPH: + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 1, CHAN_W ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_cos( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + return 0; + break; + + case TGSI_OPCODE_DDY: + return 0; + break; + + case TGSI_OPCODE_KIL: + emit_kil( func, &inst->FullSrcRegisters[0] ); + break; + + case TGSI_OPCODE_PK2H: + return 0; + break; + + case TGSI_OPCODE_PK2US: + return 0; + break; + + case TGSI_OPCODE_PK4B: + return 0; + break; + + case TGSI_OPCODE_PK4UB: + return 0; + break; + + case TGSI_OPCODE_RFL: + return 0; + break; + + case TGSI_OPCODE_SEQ: + return 0; + break; + + case TGSI_OPCODE_SFL: + return 0; + break; + + case TGSI_OPCODE_SGT: + return 0; + break; + + case TGSI_OPCODE_SIN: + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_sin( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + return 0; + break; + + case TGSI_OPCODE_SNE: + return 0; + break; + + case TGSI_OPCODE_STR: + return 0; + break; + + case TGSI_OPCODE_TEX: - emit_tempf( - func, - 0, - TGSI_EXEC_TEMP_ONE_I, - TGSI_EXEC_TEMP_ONE_C ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); ++ if (0) { ++ /* Disable dummy texture code: ++ */ ++ emit_tempf( ++ func, ++ 0, ++ TGSI_EXEC_TEMP_ONE_I, ++ TGSI_EXEC_TEMP_ONE_C ); ++ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { ++ STORE( func, *inst, 0, 0, chan_index ); ++ } ++ } ++ else { ++ return 0; + } + break; + + case TGSI_OPCODE_TXD: + return 0; + break; + + case TGSI_OPCODE_UP2H: + return 0; + break; + + case TGSI_OPCODE_UP2US: + return 0; + break; + + case TGSI_OPCODE_UP4B: + return 0; + break; + + case TGSI_OPCODE_UP4UB: + return 0; + break; + + case TGSI_OPCODE_X2D: + return 0; + break; + + case TGSI_OPCODE_ARA: + return 0; + break; + + case TGSI_OPCODE_ARR: + return 0; + break; + + case TGSI_OPCODE_BRA: + return 0; + break; + + case TGSI_OPCODE_CAL: + return 0; + break; + + case TGSI_OPCODE_RET: + case TGSI_OPCODE_END: + #ifdef WIN32 + emit_retw( func, 16 ); + #else + emit_ret( func ); + #endif + break; + + case TGSI_OPCODE_SSG: + return 0; + break; + + case TGSI_OPCODE_CMP: + emit_cmp (func, inst); + break; + + case TGSI_OPCODE_SCS: + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_cos( func, 0 ); + STORE( func, *inst, 0, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + FETCH( func, *inst, 0, 0, CHAN_Y ); + emit_sin( func, 0 ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + FETCH( func, *inst, 0, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C ); + STORE( func, *inst, 0, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_TXB: + return 0; + break; + + case TGSI_OPCODE_NRM: + return 0; + break; + + case TGSI_OPCODE_DIV: + return 0; + break; + + case TGSI_OPCODE_DP2: + return 0; + break; + + case TGSI_OPCODE_TXL: + return 0; + break; + + case TGSI_OPCODE_BRK: + return 0; + break; + + case TGSI_OPCODE_IF: + return 0; + break; + + case TGSI_OPCODE_LOOP: + return 0; + break; + + case TGSI_OPCODE_REP: + return 0; + break; + + case TGSI_OPCODE_ELSE: + return 0; + break; + + case TGSI_OPCODE_ENDIF: + return 0; + break; + + case TGSI_OPCODE_ENDLOOP: + return 0; + break; + + case TGSI_OPCODE_ENDREP: + return 0; + break; + + case TGSI_OPCODE_PUSHA: + return 0; + break; + + case TGSI_OPCODE_POPA: + return 0; + break; + + case TGSI_OPCODE_CEIL: + return 0; + break; + + case TGSI_OPCODE_I2F: + return 0; + break; + + case TGSI_OPCODE_NOT: + return 0; + break; + + case TGSI_OPCODE_TRUNC: + return 0; + break; + + case TGSI_OPCODE_SHL: + return 0; + break; + + case TGSI_OPCODE_SHR: + return 0; + break; + + case TGSI_OPCODE_AND: + return 0; + break; + + case TGSI_OPCODE_OR: + return 0; + break; + + case TGSI_OPCODE_MOD: + return 0; + break; + + case TGSI_OPCODE_XOR: + return 0; + break; + + case TGSI_OPCODE_SAD: + return 0; + break; + + case TGSI_OPCODE_TXF: + return 0; + break; + + case TGSI_OPCODE_TXQ: + return 0; + break; + + case TGSI_OPCODE_CONT: + return 0; + break; + + case TGSI_OPCODE_EMIT: + return 0; + break; + + case TGSI_OPCODE_ENDPRIM: + return 0; + break; + + default: + return 0; + } + + return 1; + } + + static void + emit_declaration( + struct x86_function *func, + struct tgsi_full_declaration *decl ) + { + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + unsigned i, j; + + assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE ); + + first = decl->u.DeclarationRange.First; + last = decl->u.DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + - /* Do not touch WPOS.xy */ - if( first == 0 ) { - mask &= ~TGSI_WRITEMASK_XY; - if( mask == TGSI_WRITEMASK_NONE ) { - first++; - } - } - + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + switch( decl->Interpolation.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + emit_coef_a0( func, 0, i, j ); + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_LINEAR: - emit_inputf( func, 0, 0, TGSI_SWIZZLE_X ); ++ emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); - emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y ); ++ emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_coef_a0( func, 4, i, j ); + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 4 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: - emit_inputf( func, 0, 0, TGSI_SWIZZLE_X ); ++ emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); - emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y ); ++ emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ - emit_inputf( func, 4, 0, TGSI_SWIZZLE_W ); ++ emit_tempf( func, 4, 0, TGSI_SWIZZLE_W ); + emit_coef_a0( func, 5, i, j ); + emit_rcp( func, 4, 4 ); /* 1.0 / w */ + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 5 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */ + emit_inputs( func, 0, i, j ); + break; + + default: + assert( 0 ); + break; + } + } + } + } + } + } + + unsigned + tgsi_emit_sse2( + struct tgsi_token *tokens, + struct x86_function *func ) + { + struct tgsi_parse_context parse; + unsigned ok = 1; + + DUMP_START(); + + func->csr = func->store; + + emit_mov( + func, + get_input_base(), + get_argument( 0 ) ); + emit_mov( + func, + get_output_base(), + get_argument( 1 ) ); + emit_mov( + func, + get_const_base(), + get_argument( 2 ) ); + emit_mov( + func, + get_temp_base(), + get_argument( 3 ) ); + + tgsi_parse_init( &parse, tokens ); + + while( !tgsi_parse_end_of_tokens( &parse ) && ok ) { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + ok = emit_instruction( + func, + &parse.FullToken.FullInstruction ); + + if (!ok) { + debug_printf("failed to translate tgsi opcode %d\n", + parse.FullToken.FullInstruction.Instruction.Opcode ); + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* XXX implement this */ + ok = 0; + debug_printf("failed to emit immediate value\n"); + break; + + default: + assert( 0 ); + ok = 0; + break; + } + } + + tgsi_parse_free( &parse ); + + DUMP_END(); + + return ok; + } + + /** + * Fragment shaders are responsible for interpolating shader inputs. Because on + * x86 we have only 4 GP registers, and here we have 5 shader arguments (input, + * output, const, temp and coef), the code is split into two phases -- + * DECLARATION and INSTRUCTION phase. + * GP register holding the output argument is aliased with the coeff argument, + * as outputs are not needed in the DECLARATION phase. + */ + unsigned + tgsi_emit_sse2_fs( + struct tgsi_token *tokens, + struct x86_function *func ) + { + struct tgsi_parse_context parse; + boolean instruction_phase = FALSE; + + DUMP_START(); + + func->csr = func->store; + + /* DECLARATION phase, do not load output argument. */ + emit_mov( + func, + get_input_base(), + get_argument( 0 ) ); + emit_mov( + func, + get_const_base(), + get_argument( 2 ) ); + emit_mov( + func, + get_temp_base(), + get_argument( 3 ) ); + emit_mov( + func, + get_coef_base(), + get_argument( 4 ) ); + + tgsi_parse_init( &parse, tokens ); + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + emit_declaration( + func, + &parse.FullToken.FullDeclaration ); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if( !instruction_phase ) { + /* INSTRUCTION phase, overwrite coeff with output. */ + instruction_phase = TRUE; + emit_mov( + func, + get_output_base(), + get_argument( 1 ) ); + } + emit_instruction( + func, + &parse.FullToken.FullInstruction ); + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* XXX implement this */ + assert(0); + break; + + default: + assert( 0 ); + } + } + + tgsi_parse_free( &parse ); + + DUMP_END(); + + return 1; + } + + #endif /* i386 */ diff --cc src/gallium/drivers/softpipe/Makefile index 00000000000,2304ea4246a..5479daf8ea2 mode 000000,100644..100644 --- a/src/gallium/drivers/softpipe/Makefile +++ b/src/gallium/drivers/softpipe/Makefile @@@ -1,0 -1,50 +1,53 @@@ + + TOP = ../../../.. + include $(TOP)/configs/current + + LIBNAME = softpipe + + DRIVER_SOURCES = \ ++ sp_fs_exec.c \ ++ sp_fs_sse.c \ ++ sp_fs_llvm.c \ + sp_clear.c \ + sp_flush.c \ + sp_query.c \ + sp_context.c \ + sp_draw_arrays.c \ + sp_prim_setup.c \ + sp_prim_vbuf.c \ + sp_quad.c \ + sp_quad_alpha_test.c \ + sp_quad_blend.c \ + sp_quad_bufloop.c \ + sp_quad_colormask.c \ + sp_quad_coverage.c \ + sp_quad_depth_test.c \ + sp_quad_earlyz.c \ + sp_quad_fs.c \ + sp_quad_occlusion.c \ + sp_quad_output.c \ + sp_quad_stencil.c \ + sp_quad_stipple.c \ + sp_state_blend.c \ + sp_state_clip.c \ + sp_state_derived.c \ + sp_state_fs.c \ + sp_state_sampler.c \ + sp_state_rasterizer.c \ + sp_state_surface.c \ + sp_state_vertex.c \ + sp_texture.c \ + sp_tex_sample.c \ + sp_tile_cache.c \ + sp_surface.c + + C_SOURCES = \ + $(COMMON_SOURCES) \ + $(DRIVER_SOURCES) + + ASM_SOURCES = + + include ../../Makefile.template + + symlinks: + diff --cc src/gallium/drivers/softpipe/sp_context.h index 00000000000,8c79cb3ce4f..b70d4fea85a mode 000000,100644..100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@@ -1,0 -1,152 +1,152 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* Authors: Keith Whitwell + */ + + #ifndef SP_CONTEXT_H + #define SP_CONTEXT_H + + #include "pipe/p_context.h" + #include "pipe/p_defines.h" + + #include "draw/draw_vertex.h" + + #include "sp_quad.h" + + + struct softpipe_winsys; + struct softpipe_vbuf_render; + struct draw_context; + struct draw_stage; + struct softpipe_tile_cache; -struct sp_fragment_shader_state; -struct sp_vertex_shader_state; ++struct sp_fragment_shader; ++struct sp_vertex_shader; + + + struct softpipe_context { + struct pipe_context pipe; /**< base class */ + struct softpipe_winsys *winsys; /**< window system interface */ + + + /* The most recent drawing state as set by the driver: + */ + const struct pipe_blend_state *blend; + const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; + const struct pipe_depth_stencil_alpha_state *depth_stencil; + const struct pipe_rasterizer_state *rasterizer; - const struct sp_fragment_shader_state *fs; - const struct sp_vertex_shader_state *vs; ++ const struct sp_fragment_shader *fs; ++ const struct sp_vertex_shader *vs; + + struct pipe_blend_color blend_color; + struct pipe_clip_state clip; + struct pipe_constant_buffer constants[2]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct softpipe_texture *texture[PIPE_MAX_SAMPLERS]; + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX]; + struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX]; + unsigned dirty; + + /* Counter for occlusion queries. Note this supports overlapping + * queries. + */ + uint64 occlusion_count; + + /* + * Mapped vertex buffers + */ + ubyte *mapped_vbuffer[PIPE_ATTRIB_MAX]; + + /** Mapped constant buffers */ + void *mapped_constants[PIPE_SHADER_TYPES]; + + /** Vertex format */ + struct vertex_info vertex_info; + struct vertex_info vertex_info_vbuf; + + int psize_slot; + + #if 0 + /* Stipple derived state: + */ + ubyte stipple_masks[16][16]; + #endif + + /** Derived from scissor and surface bounds: */ + struct pipe_scissor_state cliprect; + + unsigned line_stipple_counter; + + /** Software quad rendering pipeline */ + struct { + struct quad_stage *polygon_stipple; + struct quad_stage *earlyz; + struct quad_stage *shade; + struct quad_stage *alpha_test; + struct quad_stage *stencil_test; + struct quad_stage *depth_test; + struct quad_stage *occlusion; + struct quad_stage *coverage; + struct quad_stage *bufloop; + struct quad_stage *blend; + struct quad_stage *colormask; + struct quad_stage *output; + + struct quad_stage *first; /**< points to one of the above stages */ + } quad; + + /** The primitive drawing context */ + struct draw_context *draw; + struct draw_stage *setup; + struct draw_stage *vbuf; + struct softpipe_vbuf_render *vbuf_render; + + uint current_cbuf; /**< current color buffer being written to */ + + struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS]; + struct softpipe_tile_cache *zsbuf_cache; + + struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS]; + + int use_sse : 1; + int dump_fs : 1; + }; + + + + + static INLINE struct softpipe_context * + softpipe_context( struct pipe_context *pipe ) + { + return (struct softpipe_context *)pipe; + } + + + #endif /* SP_CONTEXT_H */ diff --cc src/gallium/drivers/softpipe/sp_fs.h index 00000000000,00000000000..4792ace3a33 new file mode 100644 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs.h @@@ -1,0 -1,0 +1,54 @@@ ++/************************************************************************** ++ * ++ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. ++ * All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ * ++ **************************************************************************/ ++ ++/* Authors: Keith Whitwell ++ */ ++ ++#ifndef SP_FS_H ++#define SP_FS_H ++ ++struct sp_fragment_shader * ++softpipe_create_fs_exec(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ); ++ ++struct sp_fragment_shader * ++softpipe_create_fs_sse(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ); ++ ++struct sp_fragment_shader * ++softpipe_create_fs_llvm(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ); ++ ++struct tgsi_interp_coef; ++struct tgsi_exec_vector; ++ ++void sp_setup_pos_vector(const struct tgsi_interp_coef *coef, ++ float x, float y, ++ struct tgsi_exec_vector *quadpos); ++ ++ ++#endif diff --cc src/gallium/drivers/softpipe/sp_fs_exec.c index 00000000000,00000000000..9ad30a76817 new file mode 100644 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@@ -1,0 -1,0 +1,112 @@@ ++/************************************************************************** ++ * ++ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. ++ * All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ * ++ **************************************************************************/ ++ ++ ++#include "sp_context.h" ++#include "sp_state.h" ++#include "sp_fs.h" ++#include "sp_headers.h" ++ ++ ++#include "pipe/p_state.h" ++#include "pipe/p_defines.h" ++#include "pipe/p_util.h" ++#include "pipe/p_inlines.h" ++#include "tgsi/exec/tgsi_exec.h" ++ ++struct sp_exec_fragment_shader { ++ struct sp_fragment_shader base; ++}; ++ ++ ++ ++ ++static void ++exec_prepare( struct sp_fragment_shader *base, ++ struct tgsi_exec_machine *machine, ++ struct tgsi_sampler *samplers ) ++{ ++ tgsi_exec_machine_bind_shader( machine, ++ base->shader.tokens, ++ PIPE_MAX_SAMPLERS, ++ samplers ); ++} ++ ++ ++ ++ ++/* TODO: hide the machine struct in here somewhere, remove from this ++ * interface: ++ */ ++static unsigned ++exec_run( struct sp_fragment_shader *base, ++ struct tgsi_exec_machine *machine, ++ struct quad_header *quad ) ++{ ++ ++ /* Compute X, Y, Z, W vals for this quad */ ++ sp_setup_pos_vector(quad->posCoef, ++ (float)quad->x0, (float)quad->y0, ++ &machine->QuadPos); ++ ++ return tgsi_exec_machine_run( machine ); ++} ++ ++ ++ ++static void ++exec_delete( struct sp_fragment_shader *base ) ++{ ++ FREE(base); ++} ++ ++ ++ ++ ++ ++struct sp_fragment_shader * ++softpipe_create_fs_exec(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ) ++{ ++ struct sp_exec_fragment_shader *shader; ++ ++ /* Decide whether we'll be codegenerating this shader and if so do ++ * that now. ++ */ ++ ++ shader = CALLOC_STRUCT(sp_exec_fragment_shader); ++ if (!shader) ++ return NULL; ++ ++ shader->base.shader = *templ; ++ shader->base.prepare = exec_prepare; ++ shader->base.run = exec_run; ++ shader->base.delete = exec_delete; ++ ++ return &shader->base; ++} ++ diff --cc src/gallium/drivers/softpipe/sp_fs_llvm.c index 00000000000,00000000000..22da4714533 new file mode 100644 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_llvm.c @@@ -1,0 -1,0 +1,200 @@@ ++/************************************************************************** ++ * ++ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. ++ * All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ * ++ **************************************************************************/ ++ ++/* Authors: ++ * Zack Rusin ++ */ ++ ++#include "sp_context.h" ++#include "sp_state.h" ++#include "sp_fs.h" ++ ++ ++#include "pipe/p_state.h" ++#include "pipe/p_defines.h" ++#include "pipe/p_util.h" ++#include "pipe/p_inlines.h" ++#include "tgsi/exec/tgsi_sse2.h" ++ ++#if 0 ++ ++struct sp_llvm_fragment_shader { ++ struct sp_fragment_shader base; ++ struct gallivm_prog *llvm_prog; ++}; ++ ++static void ++shade_quad_llvm(struct quad_stage *qs, ++ struct quad_header *quad) ++{ ++ struct quad_shade_stage *qss = quad_shade_stage(qs); ++ struct softpipe_context *softpipe = qs->softpipe; ++ float dests[4][16][4] ALIGN16_ATTRIB; ++ float inputs[4][16][4] ALIGN16_ATTRIB; ++ const float fx = (float) quad->x0; ++ const float fy = (float) quad->y0; ++ struct gallivm_prog *llvm = qss->llvm_prog; ++ ++ inputs[0][0][0] = fx; ++ inputs[1][0][0] = fx + 1.0f; ++ inputs[2][0][0] = fx; ++ inputs[3][0][0] = fx + 1.0f; ++ ++ inputs[0][0][1] = fy; ++ inputs[1][0][1] = fy; ++ inputs[2][0][1] = fy + 1.0f; ++ inputs[3][0][1] = fy + 1.0f; ++ ++ ++ gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef); ++ ++#if DLLVM ++ debug_printf("MASK = %d\n", quad->mask); ++ for (int i = 0; i < 4; ++i) { ++ for (int j = 0; j < 2; ++j) { ++ debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, ++ inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]); ++ } ++ } ++#endif ++ ++ quad->mask &= ++ gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs, ++ softpipe->mapped_constants[PIPE_SHADER_FRAGMENT], ++ qss->samplers); ++#if DLLVM ++ debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n", ++ dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], ++ dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]); ++#endif ++ ++ /* store result color */ ++ if (qss->colorOutSlot >= 0) { ++ unsigned i; ++ /* XXX need to handle multiple color outputs someday */ ++ allvmrt(qss->stage.softpipe->fs->shader.output_semantic_name[qss->colorOutSlot] ++ == TGSI_SEMANTIC_COLOR); ++ for (i = 0; i < QUAD_SIZE; ++i) { ++ quad->outputs.color[0][i] = dests[i][qss->colorOutSlot][0]; ++ quad->outputs.color[1][i] = dests[i][qss->colorOutSlot][1]; ++ quad->outputs.color[2][i] = dests[i][qss->colorOutSlot][2]; ++ quad->outputs.color[3][i] = dests[i][qss->colorOutSlot][3]; ++ } ++ } ++#if DLLVM ++ for (int i = 0; i < QUAD_SIZE; ++i) { ++ debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot, ++ quad->outputs.color[0][i], ++ quad->outputs.color[1][i], ++ quad->outputs.color[2][i], ++ quad->outputs.color[3][i]); ++ } ++#endif ++ ++ /* store result Z */ ++ if (qss->depthOutSlot >= 0) { ++ /* output[slot] is new Z */ ++ uint i; ++ for (i = 0; i < 4; i++) { ++ quad->outputs.depth[i] = dests[i][0][2]; ++ } ++ } ++ else { ++ /* copy input Z (which was interpolated by the executor) to output Z */ ++ uint i; ++ for (i = 0; i < 4; i++) { ++ quad->outputs.depth[i] = inputs[i][0][2]; ++ } ++ } ++#if DLLVM ++ debug_printf("D [%f, %f, %f, %f] mask = %d\n", ++ quad->outputs.depth[0], ++ quad->outputs.depth[1], ++ quad->outputs.depth[2], ++ quad->outputs.depth[3], quad->mask); ++#endif ++ ++ /* shader may cull fragments */ ++ if( quad->mask ) { ++ qs->next->run( qs->next, quad ); ++ } ++} ++ ++ ++unsigned ++run_llvm_fs( struct sp_fragment_shader *base, ++ struct foo *machine ) ++{ ++} ++ ++ ++void ++delete_llvm_fs( struct sp_fragment_shader *base ) ++{ ++ FREE(base); ++} ++ ++ ++struct sp_fragment_shader * ++softpipe_create_fs_llvm(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ) ++{ ++ struct sp_llvm_fragment_shader *shader = NULL; ++ ++ /* LLVM fragment shaders currently disabled: ++ */ ++ state = CALLOC_STRUCT(sp_llvm_shader_state); ++ if (!state) ++ return NULL; ++ ++ state->llvm_prog = 0; ++ ++ if (!gallivm_global_cpu_engine()) { ++ gallivm_cpu_engine_create(state->llvm_prog); ++ } ++ else ++ gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog); ++ ++ if (shader) { ++ shader->base.run = run_llvm_fs; ++ shader->base.delete = delete_llvm_fs; ++ } ++ ++ return shader; ++} ++ ++ ++#else ++ ++struct sp_fragment_shader * ++softpipe_create_fs_llvm(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ) ++{ ++ return NULL; ++} ++ ++#endif diff --cc src/gallium/drivers/softpipe/sp_fs_sse.c index 00000000000,00000000000..28c5d8c556d new file mode 100644 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@@ -1,0 -1,0 +1,192 @@@ ++/************************************************************************** ++ * ++ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. ++ * All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ * ++ **************************************************************************/ ++ ++ ++#include "sp_context.h" ++#include "sp_state.h" ++#include "sp_fs.h" ++#include "sp_headers.h" ++ ++ ++#include "pipe/p_state.h" ++#include "pipe/p_defines.h" ++#include "pipe/p_util.h" ++#include "pipe/p_inlines.h" ++#include "tgsi/exec/tgsi_exec.h" ++#include "tgsi/exec/tgsi_sse2.h" ++ ++ ++#if defined(__i386__) || defined(__386__) ++ ++#include "x86/rtasm/x86sse.h" ++ ++/* Surely this should be defined somewhere in a tgsi header: ++ */ ++typedef void (XSTDCALL *codegen_function)( ++ const struct tgsi_exec_vector *input, ++ struct tgsi_exec_vector *output, ++ float (*constant)[4], ++ struct tgsi_exec_vector *temporary, ++ const struct tgsi_interp_coef *coef ++ //, const struct tgsi_exec_vector *quadPos ++ ); ++ ++ ++struct sp_sse_fragment_shader { ++ struct sp_fragment_shader base; ++ struct x86_function sse2_program; ++ codegen_function func; ++}; ++ ++ ++/** ++ * Compute quad X,Y,Z,W for the four fragments in a quad. ++ * ++ * This should really be part of the compiled shader. ++ */ ++void ++sp_setup_pos_vector(const struct tgsi_interp_coef *coef, ++ float x, float y, ++ struct tgsi_exec_vector *quadpos) ++{ ++ uint chan; ++ /* do X */ ++ quadpos->xyzw[0].f[0] = x; ++ quadpos->xyzw[0].f[1] = x + 1; ++ quadpos->xyzw[0].f[2] = x; ++ quadpos->xyzw[0].f[3] = x + 1; ++ ++ /* do Y */ ++ quadpos->xyzw[1].f[0] = y; ++ quadpos->xyzw[1].f[1] = y; ++ quadpos->xyzw[1].f[2] = y + 1; ++ quadpos->xyzw[1].f[3] = y + 1; ++ ++ /* do Z and W for all fragments in the quad */ ++ for (chan = 2; chan < 4; chan++) { ++ const float dadx = coef->dadx[chan]; ++ const float dady = coef->dady[chan]; ++ const float a0 = coef->a0[chan] + dadx * x + dady * y; ++ quadpos->xyzw[chan].f[0] = a0; ++ quadpos->xyzw[chan].f[1] = a0 + dadx; ++ quadpos->xyzw[chan].f[2] = a0 + dady; ++ quadpos->xyzw[chan].f[3] = a0 + dadx + dady; ++ } ++} ++ ++ ++static void ++sse_prepare( struct sp_fragment_shader *base, ++ struct tgsi_exec_machine *machine, ++ struct tgsi_sampler *samplers ) ++{ ++} ++ ++ ++/* TODO: codegenerate the whole run function, skip this wrapper. ++ * TODO: break dependency on tgsi_exec_machine struct ++ * TODO: push Position calculation into the generated shader ++ * TODO: process >1 quad at a time ++ */ ++static unsigned ++sse_run( struct sp_fragment_shader *base, ++ struct tgsi_exec_machine *machine, ++ struct quad_header *quad ) ++{ ++ struct sp_sse_fragment_shader *shader = (struct sp_sse_fragment_shader *) base; ++ ++ /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */ ++ sp_setup_pos_vector(quad->posCoef, ++ (float)quad->x0, (float)quad->y0, ++ machine->Temps); ++ ++ shader->func( machine->Inputs, ++ machine->Outputs, ++ machine->Consts, ++ machine->Temps, ++ machine->InterpCoefs ++ // , &machine->QuadPos ++ ); ++ ++ return ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]); ++} ++ ++ ++static void ++sse_delete( struct sp_fragment_shader *base ) ++{ ++ struct sp_sse_fragment_shader *shader = (struct sp_sse_fragment_shader *) base; ++ ++ x86_release_func( &shader->sse2_program ); ++ FREE(shader); ++} ++ ++ ++struct sp_fragment_shader * ++softpipe_create_fs_sse(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ) ++{ ++ struct sp_sse_fragment_shader *shader; ++ ++ if (!softpipe->use_sse) ++ return NULL; ++ ++ shader = CALLOC_STRUCT(sp_sse_fragment_shader); ++ if (!shader) ++ return NULL; ++ ++ x86_init_func( &shader->sse2_program ); ++ ++ if (!tgsi_emit_sse2_fs( templ->tokens, &shader->sse2_program )) { ++ FREE(shader); ++ return NULL; ++ } ++ ++ shader->func = (codegen_function) x86_get_func( &shader->sse2_program ); ++ assert(shader->func); ++ ++ shader->base.shader = *templ; ++ shader->base.prepare = sse_prepare; ++ shader->base.run = sse_run; ++ shader->base.delete = sse_delete; ++ ++ return &shader->base; ++} ++ ++ ++#else ++ ++/* Maybe put this varient in the header file. ++ */ ++struct sp_fragment_shader * ++softpipe_create_fs_sse(struct softpipe_context *softpipe, ++ const struct pipe_shader_state *templ) ++{ ++ return NULL; ++} ++ ++#endif diff --cc src/gallium/drivers/softpipe/sp_quad_fs.c index 00000000000,b4c01a7ea8c..cf1b1eff75c mode 000000,100644..100644 --- a/src/gallium/drivers/softpipe/sp_quad_fs.c +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@@ -1,0 -1,390 +1,208 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* Vertices are just an array of floats, with all the attributes + * packed. We currently assume a layout like: + * + * attr[0][0..3] - window position + * attr[1..n][0..3] - remaining attributes. + * + * Attributes are assumed to be 4 floats wide but are packed so that + * all the enabled attributes run contiguously. + */ + + #include "pipe/p_util.h" + #include "pipe/p_defines.h" + #include "pipe/p_shader_tokens.h" + -#include "x86/rtasm/x86sse.h" - -#ifdef MESA_LLVM -#include "llvm/gallivm.h" -#endif - + #include "sp_context.h" + #include "sp_state.h" + #include "sp_headers.h" + #include "sp_quad.h" + #include "sp_texture.h" + #include "sp_tex_sample.h" + + + struct quad_shade_stage + { + struct quad_stage stage; + struct tgsi_sampler samplers[PIPE_MAX_SAMPLERS]; + struct tgsi_exec_machine machine; + struct tgsi_exec_vector *inputs, *outputs; + int colorOutSlot, depthOutSlot; -#ifdef MESA_LLVM - struct gallivm_prog *llvm_prog; -#endif + }; + + + /** cast wrapper */ + static INLINE struct quad_shade_stage * + quad_shade_stage(struct quad_stage *qs) + { + return (struct quad_shade_stage *) qs; + } + + -/** - * Compute quad X,Y,Z,W for the four fragments in a quad. - * Note that we only need to "compute" X and Y for the upper-left fragment. - * We could do less work if we're not depth testing, or there's no - * perspective-corrected attributes, but that's seldom. - */ -static void -setup_pos_vector(const struct tgsi_interp_coef *coef, - float x, float y, - struct tgsi_exec_vector *quadpos) -{ - uint chan; - /* do X */ - quadpos->xyzw[0].f[0] = x; - /* do Y */ - quadpos->xyzw[1].f[0] = y; - /* do Z and W for all fragments in the quad */ - for (chan = 2; chan < 4; chan++) { - const float dadx = coef->dadx[chan]; - const float dady = coef->dady[chan]; - const float a0 = coef->a0[chan] + dadx * x + dady * y; - quadpos->xyzw[chan].f[0] = a0; - quadpos->xyzw[chan].f[1] = a0 + dadx; - quadpos->xyzw[chan].f[2] = a0 + dady; - quadpos->xyzw[chan].f[3] = a0 + dadx + dady; - } -} - - -typedef void (XSTDCALL *codegen_function)( - const struct tgsi_exec_vector *input, - struct tgsi_exec_vector *output, - float (*constant)[4], - struct tgsi_exec_vector *temporary, - const struct tgsi_interp_coef *coef -#if 0 - ,const struct tgsi_exec_vector *quadPos -#endif - ); - + + /** + * Execute fragment shader for the four fragments in the quad. + */ + static void + shade_quad( + struct quad_stage *qs, + struct quad_header *quad ) + { + struct quad_shade_stage *qss = quad_shade_stage( qs ); + struct softpipe_context *softpipe = qs->softpipe; + struct tgsi_exec_machine *machine = &qss->machine; + + /* Consts do not require 16 byte alignment. */ + machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT]; + + machine->InterpCoefs = quad->coef; + - /* Compute X, Y, Z, W vals for this quad */ - setup_pos_vector(quad->posCoef, (float) quad->x0, (float) quad->y0, &machine->QuadPos); - + /* run shader */ -#if defined(__i386__) || defined(__386__) - if( softpipe->use_sse ) { - codegen_function func = (codegen_function) x86_get_func( &softpipe->fs->sse2_program ); - func( - machine->Inputs, - machine->Outputs, - machine->Consts, - machine->Temps, - machine->InterpCoefs -#if 0 - ,machine->QuadPos -#endif - ); - quad->mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]); - } - else -#endif - { - quad->mask &= tgsi_exec_machine_run( machine ); - } ++ quad->mask &= softpipe->fs->run( softpipe->fs, ++ &qss->machine, ++ quad ); + + /* store result color */ + if (qss->colorOutSlot >= 0) { + /* XXX need to handle multiple color outputs someday */ + assert(qss->stage.softpipe->fs->shader.output_semantic_name[qss->colorOutSlot] + == TGSI_SEMANTIC_COLOR); + memcpy( + quad->outputs.color, + &machine->Outputs[qss->colorOutSlot].xyzw[0].f[0], + sizeof( quad->outputs.color ) ); + } + + /* + * XXX the following code for updating quad->outputs.depth + * isn't really needed if we did early z testing. + */ + + /* store result Z */ + if (qss->depthOutSlot >= 0) { + /* output[slot] is new Z */ + uint i; + for (i = 0; i < 4; i++) { + quad->outputs.depth[i] = machine->Outputs[0].xyzw[2].f[i]; + } + } + else { + /* copy input Z (which was interpolated by the executor) to output Z */ + uint i; + for (i = 0; i < 4; i++) { + quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i]; + /* XXX not sure the above line is always correct. The following + * might be better: + quad->outputs.depth[i] = machine->QuadPos.xyzw[2].f[i]; + */ + } + } + + /* shader may cull fragments */ + if( quad->mask ) { + qs->next->run( qs->next, quad ); + } + } + -#if 0 -#ifdef MESA_LLVM -#define DLLVM 0 -static void -shade_quad_llvm(struct quad_stage *qs, - struct quad_header *quad) -{ - struct quad_shade_stage *qss = quad_shade_stage(qs); - struct softpipe_context *softpipe = qs->softpipe; - float dests[4][16][4] ALIGN16_ATTRIB; - float inputs[4][16][4] ALIGN16_ATTRIB; - const float fx = (float) quad->x0; - const float fy = (float) quad->y0; - struct gallivm_prog *llvm = qss->llvm_prog; - - inputs[0][0][0] = fx; - inputs[1][0][0] = fx + 1.0f; - inputs[2][0][0] = fx; - inputs[3][0][0] = fx + 1.0f; - - inputs[0][0][1] = fy; - inputs[1][0][1] = fy; - inputs[2][0][1] = fy + 1.0f; - inputs[3][0][1] = fy + 1.0f; -#if DLLVM - debug_printf("MASK = %d\n", quad->mask); -#endif - gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef); -#if DLLVM - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 2; ++j) { - debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, - inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]); - } - } -#endif - - quad->mask &= - gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs, - softpipe->mapped_constants[PIPE_SHADER_FRAGMENT], - qss->samplers); -#if DLLVM - debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n", - dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], - dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]); -#endif - - /* store result color */ - if (qss->colorOutSlot >= 0) { - unsigned i; - /* XXX need to handle multiple color outputs someday */ - assert(qss->stage.softpipe->fs->shader.output_semantic_name[qss->colorOutSlot] - == TGSI_SEMANTIC_COLOR); - for (i = 0; i < QUAD_SIZE; ++i) { - quad->outputs.color[0][i] = dests[i][qss->colorOutSlot][0]; - quad->outputs.color[1][i] = dests[i][qss->colorOutSlot][1]; - quad->outputs.color[2][i] = dests[i][qss->colorOutSlot][2]; - quad->outputs.color[3][i] = dests[i][qss->colorOutSlot][3]; - } - } -#if DLLVM - for (int i = 0; i < QUAD_SIZE; ++i) { - debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot, - quad->outputs.color[0][i], - quad->outputs.color[1][i], - quad->outputs.color[2][i], - quad->outputs.color[3][i]); - } -#endif - - /* store result Z */ - if (qss->depthOutSlot >= 0) { - /* output[slot] is new Z */ - uint i; - for (i = 0; i < 4; i++) { - quad->outputs.depth[i] = dests[i][0][2]; - } - } - else { - /* copy input Z (which was interpolated by the executor) to output Z */ - uint i; - for (i = 0; i < 4; i++) { - quad->outputs.depth[i] = inputs[i][0][2]; - } - } -#if DLLVM - debug_printf("D [%f, %f, %f, %f] mask = %d\n", - quad->outputs.depth[0], - quad->outputs.depth[1], - quad->outputs.depth[2], - quad->outputs.depth[3], quad->mask); -#endif - - /* shader may cull fragments */ - if( quad->mask ) { - qs->next->run( qs->next, quad ); - } -} -#endif /*MESA_LLVM*/ -#endif - + /** + * Per-primitive (or per-begin?) setup + */ + static void shade_begin(struct quad_stage *qs) + { + struct quad_shade_stage *qss = quad_shade_stage(qs); + struct softpipe_context *softpipe = qs->softpipe; + unsigned i; + + /* set TGSI sampler state that varies */ + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + qss->samplers[i].state = softpipe->sampler[i]; + qss->samplers[i].texture = &softpipe->texture[i]->base; + } + -#ifdef MESA_LLVM - qss->llvm_prog = softpipe->fs->llvm_prog; -#endif - /* XXX only do this if the fragment shader changes... */ - tgsi_exec_machine_init(&qss->machine, - softpipe->fs->shader.tokens, - PIPE_MAX_SAMPLERS, - qss->samplers ); - + /* find output slots for depth, color */ + qss->colorOutSlot = -1; + qss->depthOutSlot = -1; + for (i = 0; i < qss->stage.softpipe->fs->shader.num_outputs; i++) { + switch (qss->stage.softpipe->fs->shader.output_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + qss->depthOutSlot = i; + break; + case TGSI_SEMANTIC_COLOR: + qss->colorOutSlot = i; + break; + } + } ++ ++ softpipe->fs->prepare( softpipe->fs, ++ &qss->machine, ++ qss->samplers ); + + qs->next->begin(qs->next); + } + + + static void shade_destroy(struct quad_stage *qs) + { + struct quad_shade_stage *qss = (struct quad_shade_stage *) qs; + + tgsi_exec_machine_free_data(&qss->machine); + FREE( qss->inputs ); + FREE( qss->outputs ); + FREE( qs ); + } + + + struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) + { + struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage); + uint i; + + /* allocate storage for program inputs/outputs, aligned to 16 bytes */ + qss->inputs = MALLOC(PIPE_ATTRIB_MAX * sizeof(*qss->inputs) + 16); + qss->outputs = MALLOC(PIPE_ATTRIB_MAX * sizeof(*qss->outputs) + 16); + qss->machine.Inputs = align16(qss->inputs); + qss->machine.Outputs = align16(qss->outputs); + + qss->stage.softpipe = softpipe; + qss->stage.begin = shade_begin; -#ifdef MESA_LLVM - /* disable until ported to accept - * x/y and soa layout - qss->stage.run = shade_quad_llvm; - */ - softpipe->use_sse = FALSE; - qss->stage.run = shade_quad; -#else + qss->stage.run = shade_quad; -#endif + qss->stage.destroy = shade_destroy; + + /* set TGSI sampler state that's constant */ + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + assert(softpipe->tex_cache[i]); + qss->samplers[i].get_samples = sp_get_samples; + qss->samplers[i].pipe = &softpipe->pipe; + qss->samplers[i].cache = softpipe->tex_cache[i]; + } + ++ tgsi_exec_machine_init( &qss->machine ); ++ + return &qss->stage; + } diff --cc src/gallium/drivers/softpipe/sp_state.h index 00000000000,b79db0d1f19..ef8cf67d4c3 mode 000000,100644..100644 --- a/src/gallium/drivers/softpipe/sp_state.h +++ b/src/gallium/drivers/softpipe/sp_state.h @@@ -1,0 -1,187 +1,193 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* Authors: Keith Whitwell + */ + + #ifndef SP_STATE_H + #define SP_STATE_H + + #include "pipe/p_state.h" + -#include "x86/rtasm/x86sse.h" - + + #define SP_NEW_VIEWPORT 0x1 + #define SP_NEW_RASTERIZER 0x2 + #define SP_NEW_FS 0x4 + #define SP_NEW_BLEND 0x8 + #define SP_NEW_CLIP 0x10 + #define SP_NEW_SCISSOR 0x20 + #define SP_NEW_STIPPLE 0x40 + #define SP_NEW_FRAMEBUFFER 0x80 + #define SP_NEW_DEPTH_STENCIL_ALPHA 0x100 + #define SP_NEW_CONSTANTS 0x200 + #define SP_NEW_SAMPLER 0x400 + #define SP_NEW_TEXTURE 0x800 + #define SP_NEW_VERTEX 0x1000 + #define SP_NEW_VS 0x2000 + #define SP_NEW_QUERY 0x4000 + + ++struct tgsi_sampler; ++struct tgsi_interp_coef; ++struct tgsi_exec_machine; + -#ifdef MESA_LLVM -struct gallivm_prog; -#endif + ++/** Subclass of pipe_shader_state (though it doesn't really need to be). ++ * ++ * This is starting to look an awful lot like a quad pipeline stage... ++ */ ++struct sp_fragment_shader { ++ struct pipe_shader_state shader; + -struct vertex_info; ++ void (*prepare)( struct sp_fragment_shader *shader, ++ struct tgsi_exec_machine *machine, ++ struct tgsi_sampler *samplers); + ++ /* Run the shader - this interface will get cleaned up in the ++ * future: ++ */ ++ unsigned (*run)( struct sp_fragment_shader *shader, ++ struct tgsi_exec_machine *machine, ++ struct quad_header *quad ); + -/** Subclass of pipe_shader_state */ -struct sp_fragment_shader_state { - struct pipe_shader_state shader; -#if defined(__i386__) || defined(__386__) - struct x86_function sse2_program; -#endif -#ifdef MESA_LLVM - struct gallivm_prog *llvm_prog; -#endif ++ ++ void (*delete)( struct sp_fragment_shader * ); + }; + ++struct vertex_info; + + /** Subclass of pipe_shader_state */ -struct sp_vertex_shader_state { ++struct sp_vertex_shader { + struct pipe_shader_state shader; + struct draw_vertex_shader *draw_data; + }; + + + + void * + softpipe_create_blend_state(struct pipe_context *, + const struct pipe_blend_state *); + void softpipe_bind_blend_state(struct pipe_context *, + void *); + void softpipe_delete_blend_state(struct pipe_context *, + void *); + + void * + softpipe_create_sampler_state(struct pipe_context *, + const struct pipe_sampler_state *); + void softpipe_bind_sampler_state(struct pipe_context *, unsigned, void *); + void softpipe_delete_sampler_state(struct pipe_context *, void *); + + void * + softpipe_create_depth_stencil_state(struct pipe_context *, + const struct pipe_depth_stencil_alpha_state *); + void softpipe_bind_depth_stencil_state(struct pipe_context *, void *); + void softpipe_delete_depth_stencil_state(struct pipe_context *, void *); + + void * + softpipe_create_rasterizer_state(struct pipe_context *, + const struct pipe_rasterizer_state *); + void softpipe_bind_rasterizer_state(struct pipe_context *, void *); + void softpipe_delete_rasterizer_state(struct pipe_context *, void *); + + void softpipe_set_framebuffer_state( struct pipe_context *, + const struct pipe_framebuffer_state * ); + + void softpipe_set_blend_color( struct pipe_context *pipe, + const struct pipe_blend_color *blend_color ); + + void softpipe_set_clip_state( struct pipe_context *, + const struct pipe_clip_state * ); + + void softpipe_set_constant_buffer(struct pipe_context *, + uint shader, uint index, + const struct pipe_constant_buffer *buf); + + void *softpipe_create_fs_state(struct pipe_context *, + const struct pipe_shader_state *); + void softpipe_bind_fs_state(struct pipe_context *, void *); + void softpipe_delete_fs_state(struct pipe_context *, void *); + void *softpipe_create_vs_state(struct pipe_context *, + const struct pipe_shader_state *); + void softpipe_bind_vs_state(struct pipe_context *, void *); + void softpipe_delete_vs_state(struct pipe_context *, void *); + + void softpipe_set_polygon_stipple( struct pipe_context *, + const struct pipe_poly_stipple * ); + + void softpipe_set_scissor_state( struct pipe_context *, + const struct pipe_scissor_state * ); + + void softpipe_set_sampler_texture( struct pipe_context *, + unsigned unit, + struct pipe_texture * ); + + void softpipe_set_viewport_state( struct pipe_context *, + const struct pipe_viewport_state * ); + + void softpipe_set_vertex_element(struct pipe_context *, + unsigned index, + const struct pipe_vertex_element *); + + void softpipe_set_vertex_buffer(struct pipe_context *, + unsigned index, + const struct pipe_vertex_buffer *); + + + void softpipe_update_derived( struct softpipe_context *softpipe ); + + + boolean softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode, + unsigned start, unsigned count); + + boolean softpipe_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count); + + + void + softpipe_map_surfaces(struct softpipe_context *sp); + + void + softpipe_unmap_surfaces(struct softpipe_context *sp); + + void + softpipe_map_texture_surfaces(struct softpipe_context *sp); + + void + softpipe_unmap_texture_surfaces(struct softpipe_context *sp); + + + struct vertex_info * + softpipe_get_vertex_info(struct softpipe_context *softpipe); + + struct vertex_info * + softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe); + + + #endif diff --cc src/gallium/drivers/softpipe/sp_state_fs.c index 00000000000,1e3cadd43d1..b0238f81737 mode 000000,100644..100644 --- a/src/gallium/drivers/softpipe/sp_state_fs.c +++ b/src/gallium/drivers/softpipe/sp_state_fs.c @@@ -1,0 -1,179 +1,156 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + #include "sp_context.h" + #include "sp_state.h" ++#include "sp_fs.h" + + #include "pipe/p_defines.h" + #include "pipe/p_util.h" + #include "pipe/p_inlines.h" + #include "pipe/p_winsys.h" -#include "draw/draw_context.h" + #include "pipe/p_shader_tokens.h" -#include "llvm/gallivm.h" ++#include "draw/draw_context.h" + #include "tgsi/util/tgsi_dump.h" -#include "tgsi/exec/tgsi_sse2.h" + + + void * + softpipe_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) + { + struct softpipe_context *softpipe = softpipe_context(pipe); - struct sp_fragment_shader_state *state; ++ struct sp_fragment_shader *state; + - /* Decide whether we'll be codegenerating this shader and if so do - * that now. - */ - - state = CALLOC_STRUCT(sp_fragment_shader_state); - if (!state) - return NULL; ++ if (softpipe->dump_fs) ++ tgsi_dump(templ->tokens, 0); + - state->shader = *templ; - - if (softpipe->dump_fs) { - tgsi_dump(state->shader.tokens, 0); - } ++ state = softpipe_create_fs_llvm( softpipe, templ ); ++ if (state) ++ return state; ++ ++ state = softpipe_create_fs_sse( softpipe, templ ); ++ if (state) ++ return state; + -#ifdef MESA_LLVM - state->llvm_prog = 0; - -#if 0 - if (!gallivm_global_cpu_engine()) { - gallivm_cpu_engine_create(state->llvm_prog); - } - else - gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog); -#endif - -#elif defined(__i386__) || defined(__386__) - if (softpipe->use_sse) { - x86_init_func( &state->sse2_program ); - tgsi_emit_sse2_fs( state->shader.tokens, &state->sse2_program ); - } -#endif ++ state = softpipe_create_fs_exec( softpipe, templ ); + ++ assert(state); + return state; + } + + + void + softpipe_bind_fs_state(struct pipe_context *pipe, void *fs) + { + struct softpipe_context *softpipe = softpipe_context(pipe); + - softpipe->fs = (struct sp_fragment_shader_state *) fs; ++ softpipe->fs = (struct sp_fragment_shader *) fs; + + softpipe->dirty |= SP_NEW_FS; + } + + + void + softpipe_delete_fs_state(struct pipe_context *pipe, void *fs) + { - struct sp_fragment_shader_state *state = fs; - -#if defined(__i386__) || defined(__386__) - x86_release_func( &state->sse2_program ); -#endif - - FREE( state ); ++ struct sp_fragment_shader *state = fs; ++ ++ state->delete( state ); + } + + + void * + softpipe_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) + { + struct softpipe_context *softpipe = softpipe_context(pipe); - struct sp_vertex_shader_state *state; ++ struct sp_vertex_shader *state; + - state = CALLOC_STRUCT(sp_vertex_shader_state); ++ state = CALLOC_STRUCT(sp_vertex_shader); + if (state == NULL ) { + return NULL; + } + + state->shader = *templ; + + state->draw_data = draw_create_vertex_shader(softpipe->draw, + &state->shader); + if (state->draw_data == NULL) { + FREE( state ); + return NULL; + } + + return state; + } + + + void + softpipe_bind_vs_state(struct pipe_context *pipe, void *vs) + { + struct softpipe_context *softpipe = softpipe_context(pipe); + - softpipe->vs = (const struct sp_vertex_shader_state *)vs; ++ softpipe->vs = (const struct sp_vertex_shader *)vs; + + draw_bind_vertex_shader(softpipe->draw, softpipe->vs->draw_data); + + softpipe->dirty |= SP_NEW_VS; + } + + + void + softpipe_delete_vs_state(struct pipe_context *pipe, void *vs) + { + struct softpipe_context *softpipe = softpipe_context(pipe); + - struct sp_vertex_shader_state *state = - (struct sp_vertex_shader_state *)vs; ++ struct sp_vertex_shader *state = ++ (struct sp_vertex_shader *)vs; + + draw_delete_vertex_shader(softpipe->draw, state->draw_data); + FREE( state ); + } + + + + void + softpipe_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf) + { + struct softpipe_context *softpipe = softpipe_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + /* note: reference counting */ + pipe_buffer_reference(ws, + &softpipe->constants[shader].buffer, + buf->buffer); + softpipe->constants[shader].size = buf->size; + + softpipe->dirty |= SP_NEW_CONSTANTS; + } diff --cc src/gallium/include/pipe/p_state.h index 00000000000,4d3a6b2f413..1082343e2f5 mode 000000,100644..100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@@ -1,0 -1,322 +1,322 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + /** + * Abstract graphics pipe state objects. + * + * Basic notes: + * 1. Want compact representations, so we use bitfields. + * 2. Put bitfields before other (GLfloat) fields. + */ + + + #ifndef PIPE_STATE_H + #define PIPE_STATE_H + + #include "p_compiler.h" + #include "p_defines.h" + #include "p_format.h" + + /** + * Implementation limits + */ + #define PIPE_MAX_SAMPLERS 8 + #define PIPE_MAX_CLIP_PLANES 6 + #define PIPE_MAX_CONSTANT 32 + #define PIPE_ATTRIB_MAX 32 + #define PIPE_MAX_COLOR_BUFS 8 + #define PIPE_MAX_TEXTURE_LEVELS 16 + #define PIPE_MAX_FEEDBACK_ATTRIBS 16 + #define PIPE_MAX_SHADER_INPUTS 16 + #define PIPE_MAX_SHADER_OUTPUTS 16 + + + /* fwd decls */ + struct pipe_surface; + struct pipe_winsys; + + + + /** + * The driver will certainly subclass this to include actual memory + * management information. + */ + struct pipe_buffer { + unsigned alignment; + unsigned usage; + unsigned size; + + /** Reference count */ + unsigned refcount; + }; + + + + + /** + * Primitive (point/line/tri) rasterization info + */ + struct pipe_rasterizer_state + { + unsigned flatshade:1; + unsigned light_twoside:1; + unsigned front_winding:2; /**< PIPE_WINDING_x */ + unsigned cull_mode:2; /**< PIPE_WINDING_x */ + unsigned fill_cw:2; /**< PIPE_POLYGON_MODE_x */ + unsigned fill_ccw:2; /**< PIPE_POLYGON_MODE_x */ + unsigned offset_cw:1; + unsigned offset_ccw:1; + unsigned scissor:1; + unsigned poly_smooth:1; + unsigned poly_stipple_enable:1; + unsigned point_smooth:1; + unsigned point_sprite:1; + unsigned point_size_per_vertex:1; /**< size computed in vertex shader */ + unsigned multisample:1; /* XXX maybe more ms state in future */ + unsigned line_smooth:1; + unsigned line_stipple_enable:1; + unsigned line_stipple_factor:8; /**< [1..256] actually */ + unsigned line_stipple_pattern:16; + unsigned bypass_clipping:1; + unsigned origin_lower_left:1; /**< Is (0,0) the lower-left corner? */ + + float line_width; + float point_size; /**< used when no per-vertex size */ + float offset_units; + float offset_scale; + ubyte sprite_coord_mode[PIPE_MAX_SHADER_OUTPUTS]; /**< PIPE_SPRITE_COORD_ */ + }; + + + struct pipe_poly_stipple { + unsigned stipple[32]; + }; + + + struct pipe_viewport_state { + float scale[4]; + float translate[4]; + }; + + + struct pipe_scissor_state { + unsigned minx:16; + unsigned miny:16; + unsigned maxx:16; + unsigned maxy:16; + }; + + + struct pipe_clip_state { + float ucp[PIPE_MAX_CLIP_PLANES][4]; + unsigned nr; + }; + + + /** + * Constants for vertex/fragment shaders + */ + struct pipe_constant_buffer { + struct pipe_buffer *buffer; + unsigned size; /** in bytes */ + }; + + + struct pipe_shader_state { + const struct tgsi_token *tokens; + ubyte num_inputs; + ubyte num_outputs; + ubyte input_map[PIPE_MAX_SHADER_INPUTS]; /* XXX this may be temporary */ + ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; + ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; + }; + + + struct pipe_depth_stencil_alpha_state + { + struct { + unsigned enabled:1; /**< depth test enabled? */ + unsigned writemask:1; /**< allow depth buffer writes? */ + unsigned func:3; /**< depth test func (PIPE_FUNC_x) */ + unsigned occlusion_count:1; /**< do occlusion counting? */ + } depth; + struct { + unsigned enabled:1; + unsigned func:3; /**< PIPE_FUNC_x */ + unsigned fail_op:3; /**< PIPE_STENCIL_OP_x */ + unsigned zpass_op:3; /**< PIPE_STENCIL_OP_x */ + unsigned zfail_op:3; /**< PIPE_STENCIL_OP_x */ + ubyte ref_value; + ubyte value_mask; + ubyte write_mask; + } stencil[2]; /**< [0] = front, [1] = back */ + struct { + unsigned enabled:1; + unsigned func:3; /**< PIPE_FUNC_x */ + float ref; /**< reference value */ + } alpha; + }; + + + struct pipe_blend_state { + unsigned blend_enable:1; + + unsigned rgb_func:3; /**< PIPE_BLEND_x */ + unsigned rgb_src_factor:5; /**< PIPE_BLENDFACTOR_x */ + unsigned rgb_dst_factor:5; /**< PIPE_BLENDFACTOR_x */ + + unsigned alpha_func:3; /**< PIPE_BLEND_x */ + unsigned alpha_src_factor:5; /**< PIPE_BLENDFACTOR_x */ + unsigned alpha_dst_factor:5; /**< PIPE_BLENDFACTOR_x */ + + unsigned logicop_enable:1; + unsigned logicop_func:4; /**< PIPE_LOGICOP_x */ + + unsigned colormask:4; /**< bitmask of PIPE_MASK_R/G/B/A */ + unsigned dither:1; + }; + + + struct pipe_blend_color { + float color[4]; + }; + + + struct pipe_framebuffer_state + { + /** multiple colorbuffers for multiple render targets */ + unsigned num_cbufs; + struct pipe_surface *cbufs[PIPE_MAX_COLOR_BUFS]; + + struct pipe_surface *zsbuf; /**< Z/stencil buffer */ + }; + + + /** + * Texture sampler state. + */ + struct pipe_sampler_state + { + unsigned wrap_s:3; /**< PIPE_TEX_WRAP_x */ + unsigned wrap_t:3; /**< PIPE_TEX_WRAP_x */ + unsigned wrap_r:3; /**< PIPE_TEX_WRAP_x */ + unsigned min_img_filter:2; /**< PIPE_TEX_FILTER_x */ + unsigned min_mip_filter:2; /**< PIPE_TEX_MIPFILTER_x */ + unsigned mag_img_filter:2; /**< PIPE_TEX_FILTER_x */ + unsigned compare:1; /**< shadow/depth compare enabled? */ + unsigned compare_mode:1; /**< PIPE_TEX_COMPARE_x */ + unsigned compare_func:3; /**< PIPE_FUNC_x */ + unsigned normalized_coords:1; /**< Are coords normalized to [0,1]? */ + float shadow_ambient; /**< shadow test fail color/intensity */ + float lod_bias; /**< LOD/lambda bias */ + float min_lod, max_lod; /**< LOD clamp range, after bias */ + float border_color[4]; + float max_anisotropy; + }; + + + /** + * 2D surface. This is basically a view into a memory buffer. + * May be a renderbuffer, texture mipmap level, etc. + */ + struct pipe_surface + { + struct pipe_buffer *buffer; /**< driver private buffer handle */ + enum pipe_format format; /**< PIPE_FORMAT_x */ + unsigned status; /**< PIPE_SURFACE_STATUS_x */ + unsigned clear_value; /**< may be temporary */ + unsigned cpp; /**< bytes per pixel */ - unsigned width, height; ++ unsigned width; ++ unsigned height; + unsigned pitch; /**< in pixels */ + unsigned offset; /**< offset from start of buffer, in bytes */ + unsigned refcount; + struct pipe_winsys *winsys; /**< winsys which owns/created the surface */ + }; + + + /** + * Texture. Represents one or several texture images on one or several mipmap + * levels. + */ + struct pipe_texture + { + /* Effectively the key: + */ + enum pipe_texture_target target; /**< PIPE_TEXTURE_x */ + enum pipe_format format; /**< PIPE_FORMAT_x */ + - unsigned last_level; /**< Index of last mipmap level present/defined */ - + unsigned width[PIPE_MAX_TEXTURE_LEVELS]; + unsigned height[PIPE_MAX_TEXTURE_LEVELS]; + unsigned depth[PIPE_MAX_TEXTURE_LEVELS]; - unsigned cpp; + ++ unsigned cpp:8; ++ unsigned last_level:8; /**< Index of last mipmap level present/defined */ + unsigned compressed:1; + + /* These are also refcounted: + */ + unsigned refcount; + }; + + + /** + * A vertex buffer. Typically, all the vertex data/attributes for + * drawing something will be in one buffer. But it's also possible, for + * example, to put colors in one buffer and texcoords in another. + */ + struct pipe_vertex_buffer + { - unsigned pitch:11; /**< stride to same attrib in next vertex, in bytes */ ++ unsigned pitch; /**< stride to same attrib in next vertex, in bytes */ + unsigned max_index; /**< number of vertices in this buffer */ + unsigned buffer_offset; /**< offset to start of data in buffer, in bytes */ + struct pipe_buffer *buffer; /**< the actual buffer */ + }; + + + /** + * Information to describe a vertex attribute (position, color, etc) + */ + struct pipe_vertex_element + { + /** Offset of this attribute, in bytes, from the start of the vertex */ - unsigned src_offset:11; ++ unsigned src_offset; + + /** Which vertex_buffer (as given to pipe->set_vertex_buffer()) does + * this attribute live in? + */ - unsigned vertex_buffer_index:5; - unsigned nr_components:3; ++ unsigned vertex_buffer_index:8; ++ unsigned nr_components:8; + + enum pipe_format src_format; /**< PIPE_FORMAT_* */ + }; + + + #endif