From: Keith Whitwell
Date: Fri, 15 Feb 2008 11:18:04 +0000 (+0000)
Subject: Merge commit 'origin/gallium-0.1' into gallium-0.1
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=30d0bacf7aecfb6013ddd665d7385209899eeebd;p=mesa.git

Merge commit 'origin/gallium-0.1' into gallium-0.1
---

30d0bacf7aecfb6013ddd665d7385209899eeebd
diff --cc src/gallium/auxiliary/draw/draw_vertex_shader.c
index 00000000000,377ecbb931c..9413f8b43a4
mode 000000,100644..100644
--- a/src/gallium/auxiliary/draw/draw_vertex_shader.c
+++ b/src/gallium/auxiliary/draw/draw_vertex_shader.c
@@@ -1,0 -1,325 +1,327 @@@
+ /**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+ 
+ /*
+  * Authors:
+  *   Keith Whitwell
+  *   Brian Paul
+  */
+ 
+ #include "pipe/p_util.h"
+ #include "pipe/p_shader_tokens.h"
+ #if defined(__i386__) || defined(__386__)
+ #include "tgsi/exec/tgsi_sse2.h"
+ #endif
+ #include "draw_private.h"
+ #include "draw_context.h"
+ 
+ #include "x86/rtasm/x86sse.h"
+ #include "llvm/gallivm.h"
+ 
+ 
+ #define DBG_VS 0
+ 
+ 
+ static INLINE unsigned
+ compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+ {
+    unsigned mask = 0;
+    unsigned i;
+ 
+    /* Do the hardwired planes first:
+     */
+    if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+    if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+    if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+    if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+    if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+    if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+ 
+    /* Followed by any remaining ones:
+     */
+    for (i = 6; i < nr; i++) {
+       if (dot4(clip, plane[i]) < 0)
+          mask |= (1<<i);
+    }
+ 
+    return mask;
+ }
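
For reference, the outcode test above can be exercised on its own. The
standalone sketch below is illustrative only (it is not part of this
commit, and the CLIP_* bit values here are made-up stand-ins for the
real definitions in draw_private.h); it computes the mask for a point
lying to the right of the x = +w plane:

    #include <stdio.h>

    /* Hypothetical bit values for illustration; the real ones come
     * from draw_private.h. */
    #define CLIP_RIGHT_BIT  0x01
    #define CLIP_LEFT_BIT   0x02
    #define CLIP_TOP_BIT    0x04
    #define CLIP_BOTTOM_BIT 0x08
    #define CLIP_FAR_BIT    0x10
    #define CLIP_NEAR_BIT   0x20

    int main(void)
    {
       /* Clip-space position (x, y, z, w): x/w = 2.0, so the point is
        * outside the x = +w plane and only CLIP_RIGHT_BIT gets set. */
       const float clip[4] = { 2.0f, 0.0f, 0.0f, 1.0f };
       unsigned mask = 0;

       if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
       if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
       if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
       if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
       if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
       if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;

       printf("clipmask = 0x%02x\n", mask);   /* prints 0x01 */
       return 0;
    }
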
+ 
+ 
+ /**
+  * Transform vertices with the current vertex program/shader
+  * Up to four vertices can be shaded at a time.
+  * \param draw  the draw context
+  * \param elts  indexes of four input vertices
+  * \param count  number of vertices to shade [1..4]
+  * \param vOut  array of pointers to four output vertices
+  */
+ static void
+ run_vertex_program(struct draw_context *draw,
+                    unsigned elts[4], unsigned count,
+                    struct vertex_header *vOut[])
+ {
+    struct tgsi_exec_machine *machine = &draw->machine;
+    unsigned int j;
+ 
+    ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
+    ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+    const float *scale = draw->viewport.scale;
+    const float *trans = draw->viewport.translate;
+ 
+    assert(count <= 4);
+    assert(draw->vertex_shader->state->output_semantic_name[0]
+           == TGSI_SEMANTIC_POSITION);
+ 
+    /* Consts do not require 16-byte alignment.
+     */
+    machine->Consts = (float (*)[4]) draw->user.constants;
+ 
+    machine->Inputs = ALIGN16_ASSIGN(inputs);
+    machine->Outputs = ALIGN16_ASSIGN(outputs);
+ 
+    draw->vertex_fetch.fetch_func( draw, machine, elts, count );
+ 
+    /* run shader */
+ #ifdef MESA_LLVM
+    if (1) {
+       struct gallivm_prog *prog = draw->vertex_shader->llvm_prog;
+       gallivm_cpu_vs_exec(prog,
+                           machine->Inputs,
+                           machine->Outputs,
+                           machine->Consts,
+                           machine->Temps);
+    } else
+ #elif defined(__i386__) || defined(__386__)
+    if (draw->use_sse) {
+       /* SSE */
+       /* cast away const */
+       struct draw_vertex_shader *shader
+          = (struct draw_vertex_shader *)draw->vertex_shader;
+       codegen_function func
+          = (codegen_function) x86_get_func( &shader->sse2_program );
+ 
+       if (func)
+          func(
+             machine->Inputs,
+             machine->Outputs,
+             machine->Consts,
+             machine->Temps );
+       else
+          /* interpreter */
+          tgsi_exec_machine_run( machine );
+    }
+    else
+ #endif
+    {
+       /* interpreter */
+       tgsi_exec_machine_run( machine );
+    }
+ 
+    /* store machine results */
+    for (j = 0; j < count; j++) {
+       unsigned slot;
+       float x, y, z, w;
+ 
+       /* Handle attr[0] (position) specially:
+        *
+        * XXX: Computing the clipmask should be done in the vertex
+        * program as a set of DP4 instructions appended to the
+        * user-provided code.
+        */
+       x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+       y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+       z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+       w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+ 
+       vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+       vOut[j]->edgeflag = 1;
+ 
+       /* divide by w */
+       w = 1.0f / w;
+       x *= w;
+       y *= w;
+       z *= w;
+ 
+       /* Viewport mapping */
+       vOut[j]->data[0][0] = x * scale[0] + trans[0];
+       vOut[j]->data[0][1] = y * scale[1] + trans[1];
+       vOut[j]->data[0][2] = z * scale[2] + trans[2];
+       vOut[j]->data[0][3] = w;
+ 
+ #if DBG_VS
+       debug_printf("output[%d]win: %f %f %f %f\n", j,
+                    vOut[j]->data[0][0],
+                    vOut[j]->data[0][1],
+                    vOut[j]->data[0][2],
+                    vOut[j]->data[0][3]);
+ #endif
+       /* Remaining attributes are packed into sequential post-transform
+        * vertex attrib slots.
+        */
+       for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+          vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+          vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+          vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+          vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+ #if DBG_VS
+          debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+                       vOut[j]->data[slot][0],
+                       vOut[j]->data[slot][1],
+                       vOut[j]->data[slot][2],
+                       vOut[j]->data[slot][3]);
+ #endif
+       }
+    } /* loop over vertices */
+ }
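
The store loop above does the perspective divide (keeping 1/w in the w
slot) and then applies the viewport transform. A standalone arithmetic
check of that mapping, with made-up viewport numbers rather than the
real draw->viewport state, looks like this:

    #include <stdio.h>

    int main(void)
    {
       /* Example viewport for a 640x480 window, depth range [0,1]
        * (assumed values; the real ones live in draw->viewport). */
       const float scale[3] = { 320.0f, -240.0f, 0.5f };
       const float trans[3] = { 320.0f,  240.0f, 0.5f };

       /* A clip-space position with w = 2. */
       float x = 0.5f, y = 0.5f, z = 0.0f, w = 2.0f;

       /* Perspective divide, storing 1/w as run_vertex_program does. */
       w = 1.0f / w;
       x *= w;  y *= w;  z *= w;

       /* Viewport mapping to window coordinates. */
       printf("win = (%.1f, %.1f, %.2f), 1/w = %.2f\n",
              x * scale[0] + trans[0],
              y * scale[1] + trans[1],
              z * scale[2] + trans[2],
              w);
       /* win = (400.0, 180.0, 0.50), 1/w = 0.50 */
       return 0;
    }
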
+ 
+ 
+ /**
+  * Run the vertex shader on all vertices in the vertex queue.
+  * Called by the draw module when the vertex cache needs to be flushed.
+  */
+ void
+ draw_vertex_shader_queue_flush(struct draw_context *draw)
+ {
+    unsigned i;
+ 
+    assert(draw->vs.queue_nr != 0);
+ 
+    /* XXX: do this on statechange:
+     */
+    draw_update_vertex_fetch( draw );
+ 
+    //   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+ 
+    /* run vertex shader on vertex cache entries, four per invocation */
+    for (i = 0; i < draw->vs.queue_nr; i += 4) {
+       struct vertex_header *dests[4];
+       unsigned elts[4];
+       int j, n = MIN2(4, draw->vs.queue_nr - i);
+ 
+       for (j = 0; j < n; j++) {
+          elts[j] = draw->vs.queue[i + j].elt;
+          dests[j] = draw->vs.queue[i + j].dest;
+       }
+ 
+       for ( ; j < 4; j++) {
+          elts[j] = elts[0];
+          dests[j] = dests[0];
+       }
+ 
+       assert(n > 0);
+       assert(n <= 4);
+ 
+       run_vertex_program(draw, elts, n, dests);
+    }
+ 
+    draw->vs.queue_nr = 0;
+ }
+ 
+ 
+ struct draw_vertex_shader *
+ draw_create_vertex_shader(struct draw_context *draw,
+                           const struct pipe_shader_state *shader)
+ {
+    struct draw_vertex_shader *vs;
+ 
+    vs = CALLOC_STRUCT( draw_vertex_shader );
+    if (vs == NULL) {
+       return NULL;
+    }
+ 
+    vs->state = shader;
+ 
+ #ifdef MESA_LLVM
+    struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
+    gallivm_ir_set_layout(ir, GALLIVM_SOA);
+    gallivm_ir_set_components(ir, 4);
+    gallivm_ir_fill_from_tgsi(ir, shader->tokens);
+    vs->llvm_prog = gallivm_ir_compile(ir);
+    gallivm_ir_delete(ir);
+ 
+    draw->engine = gallivm_global_cpu_engine();
+    if (!draw->engine) {
+       draw->engine = gallivm_cpu_engine_create(vs->llvm_prog);
+    }
+    else {
+       gallivm_cpu_jit_compile(draw->engine, vs->llvm_prog);
+    }
+ #elif defined(__i386__) || defined(__386__)
+    if (draw->use_sse) {
+       /* cast-away const */
+       struct pipe_shader_state *sh = (struct pipe_shader_state *) shader;
+ 
+       x86_init_func( &vs->sse2_program );
+       if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens,
+                            &vs->sse2_program )) {
+          x86_release_func( (struct x86_function *) &vs->sse2_program );
+          fprintf(stdout /*err*/,
+                  "tgsi_emit_sse2() failed, falling back to interpreter\n");
+       }
+    }
+ #endif
+ 
+    return vs;
+ }
+ 
+ 
+ void
+ draw_bind_vertex_shader(struct draw_context *draw,
+                         struct draw_vertex_shader *dvs)
+ {
+    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+ 
+    draw->vertex_shader = dvs;
+    draw->num_vs_outputs = dvs->state->num_outputs;
+ 
++   tgsi_exec_machine_init(&draw->machine);
++
+    /* specify the vertex program to interpret/execute */
-    tgsi_exec_machine_init(&draw->machine,
-                           draw->vertex_shader->state->tokens,
-                           PIPE_MAX_SAMPLERS,
-                           NULL /*samplers*/ );
++   tgsi_exec_machine_bind_shader(&draw->machine,
++                                 draw->vertex_shader->state->tokens,
++                                 PIPE_MAX_SAMPLERS,
++                                 NULL /*samplers*/ );
+ }
+ 
+ 
+ void
+ draw_delete_vertex_shader(struct draw_context *draw,
+                           struct draw_vertex_shader *dvs)
+ {
+ #if defined(__i386__) || defined(__386__)
+    x86_release_func( (struct x86_function *) &dvs->sse2_program );
+ #endif
+ 
+    FREE( dvs );
+ }
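
The substantive change in this file is the split of the old
tgsi_exec_machine_init() into a one-time machine setup plus a separate
tgsi_exec_machine_bind_shader(), so one machine can be reused across
shaders. A hypothetical caller (a sketch under the signatures added by
this merge, not code from the commit itself) would now do:

    #include "tgsi/exec/tgsi_exec.h"   /* path as used in this tree */

    /* Sketch: set up an exec machine and run one shader.
     * 'tokens' is an already-built TGSI token stream. */
    static void
    run_shader_once(struct tgsi_exec_machine *machine,
                    const struct tgsi_token *tokens)
    {
       /* One-time state: aligned temp registers, hardwired constants. */
       tgsi_exec_machine_init(machine);

       /* Per-shader state: parse the tokens into expanded instructions
        * and declarations.  Binding a different shader later does not
        * require re-initializing the machine. */
       tgsi_exec_machine_bind_shader(machine, tokens,
                                     PIPE_MAX_SAMPLERS, NULL /*samplers*/);

       /* Inputs/Consts must point at caller data before running. */
       tgsi_exec_machine_run(machine);
    }
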
diff --cc src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index 00000000000,a8f64c2287f..d7b18dc9c59
mode 000000,100644..100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@@ -1,0 -1,2485 +1,2476 @@@
+ /**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+ 
+ /**
+  * TGSI interpreter/executor.
+  *
+  * Flow control information:
+  *
+  * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+  * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+  * care since a condition may be true for some quad components but false
+  * for other components.
+  *
+  * We basically execute all statements (even if they're in the part of
+  * an IF/ELSE clause that's "not taken") and use a special mask to
+  * control writing to destination registers.  This is the ExecMask.
+  * See store_dest().
+  *
+  * The ExecMask is computed from three other masks (CondMask, LoopMask and
+  * ContMask) which are controlled by the flow control instructions (namely
+  * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+  *
+  *
+  * Authors:
+  *   Michal Krol
+  *   Brian Paul
+  */
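
To make the masking concrete: the sketch below uses illustrative types
rather than the interpreter's own, and shows a predicated per-component
store driven by a 4-bit execution mask, which is the scheme
store_dest() applies in the TGSI_SAT_NONE case:

    #include <stdio.h>

    /* Minimal stand-in for a 4-wide register channel. */
    union channel {
       float f[4];
       int   i[4];
    };

    static void
    predicated_store(union channel *dst, const union channel *src,
                     int exec_mask)
    {
       int i;
       for (i = 0; i < 4; i++) {
          if (exec_mask & (1 << i))   /* write only active components */
             dst->i[i] = src->i[i];
       }
    }

    int main(void)
    {
       union channel dst = {{ 0, 0, 0, 0 }};
       union channel src = {{ 1, 2, 3, 4 }};

       /* Suppose an IF was true only for quad components 0 and 2,
        * so ExecMask = 0101b. */
       predicated_store(&dst, &src, 0x5);

       printf("%g %g %g %g\n", dst.f[0], dst.f[1], dst.f[2], dst.f[3]);
       /* prints: 1 0 3 0 */
       return 0;
    }
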
+ 
+ #include "pipe/p_compiler.h"
+ #include "pipe/p_state.h"
+ #include "pipe/p_util.h"
+ #include "pipe/p_shader_tokens.h"
+ #include "tgsi/util/tgsi_parse.h"
+ #include "tgsi/util/tgsi_util.h"
+ #include "tgsi_exec.h"
+ 
+ #define TILE_TOP_LEFT     0
+ #define TILE_TOP_RIGHT    1
+ #define TILE_BOTTOM_LEFT  2
+ #define TILE_BOTTOM_RIGHT 3
+ 
+ /*
+  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+  */
+ #define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+ #define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+ #define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+ #define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+ #define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+ #define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+ #define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+ #define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+ #define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+ #define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+ #define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+ #define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+ #define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+ #define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+ #define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+ #define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+ #define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+ #define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+ #define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+ #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+ #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+ #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+ #define TEMP_R0            TGSI_EXEC_TEMP_R0
+ 
+ #define FOR_EACH_CHANNEL(CHAN)\
+    for (CHAN = 0; CHAN < 4; CHAN++)
+ 
+ #define IS_CHANNEL_ENABLED(INST, CHAN)\
+    ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+ 
+ #define IS_CHANNEL_ENABLED2(INST, CHAN)\
+    ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+ 
+ #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+    FOR_EACH_CHANNEL( CHAN )\
+       if (IS_CHANNEL_ENABLED( INST, CHAN ))
+ 
+ #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+    FOR_EACH_CHANNEL( CHAN )\
+       if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+ 
+ 
+ /** The execution mask depends on the conditional mask and the loop mask */
+ #define UPDATE_EXEC_MASK(MACH) \
+       MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+ 
+ 
+ #define CHAN_X  0
+ #define CHAN_Y  1
+ #define CHAN_Z  2
+ #define CHAN_W  3
+ 
+ 
-static void
-tgsi_exec_prepare( struct tgsi_exec_machine *mach )
++/**
++ * Initialize machine state by expanding tokens to full instructions,
++ * allocating temporary storage, setting up constants, etc.
++ * After this, we can call tgsi_exec_machine_run() many times.
++ */ ++void ++tgsi_exec_machine_bind_shader( ++ struct tgsi_exec_machine *mach, ++ const struct tgsi_token *tokens, ++ uint numSamplers, ++ struct tgsi_sampler *samplers) + { - struct tgsi_exec_labels *labels = &mach->Labels; ++ uint k; + struct tgsi_parse_context parse; ++ struct tgsi_exec_labels *labels = &mach->Labels; + struct tgsi_full_instruction *instructions; + struct tgsi_full_declaration *declarations; + uint maxInstructions = 10, numInstructions = 0; + uint maxDeclarations = 10, numDeclarations = 0; - uint k; + uint instno = 0; + ++#if 0 ++ tgsi_dump(tokens, 0); ++#endif ++ ++ mach->Tokens = tokens; ++ mach->Samplers = samplers; ++ ++ k = tgsi_parse_init (&parse, mach->Tokens); ++ if (k != TGSI_PARSE_OK) { ++ debug_printf( "Problem parsing!\n" ); ++ return; ++ } ++ ++ mach->Processor = parse.FullHeader.Processor.Processor; + mach->ImmLimit = 0; + labels->count = 0; + + declarations = (struct tgsi_full_declaration *) + MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) ); + + instructions = (struct tgsi_full_instruction *) + MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) ); + - k = tgsi_parse_init( &parse, mach->Tokens ); - if (k != TGSI_PARSE_OK) { - debug_printf("Problem parsing!\n"); - return; - } + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + uint pointer = parse.Position; + uint i; + + tgsi_parse_token( &parse ); + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + /* save expanded declaration */ + if (numDeclarations == maxDeclarations) { + declarations = REALLOC(declarations, + maxDeclarations + * sizeof(struct tgsi_full_declaration), + (maxDeclarations + 10) + * sizeof(struct tgsi_full_declaration)); + maxDeclarations += 10; + } + memcpy(declarations + numDeclarations, + &parse.FullToken.FullDeclaration, + sizeof(declarations[0])); + numDeclarations++; + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; + assert( size % 4 == 0 ); + assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES ); + + for( i = 0; i < size; i++ ) { - mach->Imms[mach->ImmLimit + i / 4][i % 4] = parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; ++ mach->Imms[mach->ImmLimit + i / 4][i % 4] = ++ parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; + } + mach->ImmLimit += size / 4; + } + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + assert( labels->count < 128 ); + + labels->labels[labels->count][0] = instno; + labels->labels[labels->count][1] = pointer; + labels->count++; + + /* save expanded instruction */ + if (numInstructions == maxInstructions) { + instructions = REALLOC(instructions, + maxInstructions + * sizeof(struct tgsi_full_instruction), + (maxInstructions + 10) + * sizeof(struct tgsi_full_instruction)); + maxInstructions += 10; + } + memcpy(instructions + numInstructions, + &parse.FullToken.FullInstruction, + sizeof(instructions[0])); + numInstructions++; + break; + + default: + assert( 0 ); + } + } + tgsi_parse_free (&parse); + + if (mach->Declarations) { + FREE( mach->Declarations ); + } + mach->Declarations = declarations; + mach->NumDeclarations = numDeclarations; + + if (mach->Instructions) { + FREE( mach->Instructions ); + } + mach->Instructions = instructions; + mach->NumInstructions = numInstructions; + } + + -/** - * Initialize machine state by expanding tokens to full instructions, - * allocating temporary storage, setting up constants, etc. - * After this, we can call tgsi_exec_machine_run() many times. 
- */ + void + tgsi_exec_machine_init( - struct tgsi_exec_machine *mach, - const struct tgsi_token *tokens, - uint numSamplers, - struct tgsi_sampler *samplers) ++ struct tgsi_exec_machine *mach ) + { - uint i, k; - struct tgsi_parse_context parse; - -#if 0 - tgsi_dump(tokens, 0); -#endif - - mach->Tokens = tokens; - - mach->Samplers = samplers; - - k = tgsi_parse_init (&parse, mach->Tokens); - if (k != TGSI_PARSE_OK) { - debug_printf( "Problem parsing!\n" ); - return; - } - - mach->Processor = parse.FullHeader.Processor.Processor; - tgsi_parse_free (&parse); ++ uint i; + + mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps); + mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; + + /* Setup constants. */ + for( i = 0; i < 4; i++ ) { + mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000; + mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF; + mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000; + mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF; + mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f; + mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f; + mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f; + mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f; + } - - tgsi_exec_prepare( mach ); + } + + + void + tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach) + { + if (mach->Instructions) { + FREE(mach->Instructions); + mach->Instructions = NULL; + mach->NumInstructions = 0; + } + if (mach->Declarations) { + FREE(mach->Declarations); + mach->Declarations = NULL; + mach->NumDeclarations = 0; + } + } + + + static void + micro_abs( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) fabs( (double) src->f[0] ); + dst->f[1] = (float) fabs( (double) src->f[1] ); + dst->f[2] = (float) fabs( (double) src->f[2] ); + dst->f[3] = (float) fabs( (double) src->f[3] ); + } + + static void + micro_add( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] + src1->f[0]; + dst->f[1] = src0->f[1] + src1->f[1]; + dst->f[2] = src0->f[2] + src1->f[2]; + dst->f[3] = src0->f[3] + src1->f[3]; + } + + static void + micro_iadd( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] + src1->i[0]; + dst->i[1] = src0->i[1] + src1->i[1]; + dst->i[2] = src0->i[2] + src1->i[2]; + dst->i[3] = src0->i[3] + src1->i[3]; + } + + static void + micro_and( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] & src1->u[0]; + dst->u[1] = src0->u[1] & src1->u[1]; + dst->u[2] = src0->u[2] & src1->u[2]; + dst->u[3] = src0->u[3] & src1->u[3]; + } + + static void + micro_ceil( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) ceil( (double) src->f[0] ); + dst->f[1] = (float) ceil( (double) src->f[1] ); + dst->f[2] = (float) ceil( (double) src->f[2] ); + dst->f[3] = (float) ceil( (double) src->f[3] ); + } + + static void + micro_cos( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) cos( (double) src->f[0] ); + dst->f[1] = (float) cos( (double) src->f[1] ); + dst->f[2] = (float) cos( (double) src->f[2] ); + dst->f[3] = (float) cos( (double) src->f[3] ); + } + + static void + micro_ddx( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = + dst->f[1] = + 
dst->f[2] = + dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT]; + } + + static void + micro_ddy( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = + dst->f[1] = + dst->f[2] = + dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT]; + } + + static void + micro_div( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] / src1->f[0]; + dst->f[1] = src0->f[1] / src1->f[1]; + dst->f[2] = src0->f[2] / src1->f[2]; + dst->f[3] = src0->f[3] / src1->f[3]; + } + + static void + micro_udiv( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] / src1->u[0]; + dst->u[1] = src0->u[1] / src1->u[1]; + dst->u[2] = src0->u[2] / src1->u[2]; + dst->u[3] = src0->u[3] / src1->u[3]; + } + + static void + micro_eq( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3]; + } + + static void + micro_ieq( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0]; + dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1]; + dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2]; + dst->i[3] = src0->i[3] == src1->i[3] ? 
src2->i[3] : src3->i[3]; + } + + static void + micro_exp2( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src) + { + dst->f[0] = (float) pow( 2.0, (double) src->f[0] ); + dst->f[1] = (float) pow( 2.0, (double) src->f[1] ); + dst->f[2] = (float) pow( 2.0, (double) src->f[2] ); + dst->f[3] = (float) pow( 2.0, (double) src->f[3] ); + } + + static void + micro_f2it( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->i[0] = (int) src->f[0]; + dst->i[1] = (int) src->f[1]; + dst->i[2] = (int) src->f[2]; + dst->i[3] = (int) src->f[3]; + } + + static void + micro_f2ut( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->u[0] = (uint) src->f[0]; + dst->u[1] = (uint) src->f[1]; + dst->u[2] = (uint) src->f[2]; + dst->u[3] = (uint) src->f[3]; + } + + static void + micro_flr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) floor( (double) src->f[0] ); + dst->f[1] = (float) floor( (double) src->f[1] ); + dst->f[2] = (float) floor( (double) src->f[2] ); + dst->f[3] = (float) floor( (double) src->f[3] ); + } + + static void + micro_frc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] ); + dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] ); + dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] ); + dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] ); + } + + static void + micro_ge( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3]; + } + + static void + micro_i2f( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) src->i[0]; + dst->f[1] = (float) src->i[1]; + dst->f[2] = (float) src->i[2]; + dst->f[3] = (float) src->i[3]; + } + + static void + micro_lg2( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f; + dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f; + dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f; + dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f; + } + + static void + micro_lt( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3]; + } + + static void + micro_ilt( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0]; + dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1]; + dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2]; + dst->i[3] = src0->i[3] < src1->i[3] ? 
src2->i[3] : src3->i[3]; + } + + static void + micro_ult( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) + { + dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0]; + dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1]; + dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2]; + dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3]; + } + + static void + micro_max( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0]; + dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1]; + dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2]; + dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3]; + } + + static void + micro_imax( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0]; + dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1]; + dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2]; + dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3]; + } + + static void + micro_umax( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0]; + dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1]; + dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2]; + dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3]; + } + + static void + micro_min( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0]; + dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1]; + dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2]; + dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3]; + } + + static void + micro_imin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0]; + dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1]; + dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2]; + dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3]; + } + + static void + micro_umin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0]; + dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1]; + dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2]; + dst->u[3] = src0->u[3] < src1->u[3] ? 
src0->u[3] : src1->u[3]; + } + + static void + micro_umod( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] % src1->u[0]; + dst->u[1] = src0->u[1] % src1->u[1]; + dst->u[2] = src0->u[2] % src1->u[2]; + dst->u[3] = src0->u[3] % src1->u[3]; + } + + static void + micro_mul( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] * src1->f[0]; + dst->f[1] = src0->f[1] * src1->f[1]; + dst->f[2] = src0->f[2] * src1->f[2]; + dst->f[3] = src0->f[3] * src1->f[3]; + } + + static void + micro_imul( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] * src1->i[0]; + dst->i[1] = src0->i[1] * src1->i[1]; + dst->i[2] = src0->i[2] * src1->i[2]; + dst->i[3] = src0->i[3] * src1->i[3]; + } + + static void + micro_imul64( + union tgsi_exec_channel *dst0, + union tgsi_exec_channel *dst1, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst1->i[0] = src0->i[0] * src1->i[0]; + dst1->i[1] = src0->i[1] * src1->i[1]; + dst1->i[2] = src0->i[2] * src1->i[2]; + dst1->i[3] = src0->i[3] * src1->i[3]; + dst0->i[0] = 0; + dst0->i[1] = 0; + dst0->i[2] = 0; + dst0->i[3] = 0; + } + + static void + micro_umul64( + union tgsi_exec_channel *dst0, + union tgsi_exec_channel *dst1, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst1->u[0] = src0->u[0] * src1->u[0]; + dst1->u[1] = src0->u[1] * src1->u[1]; + dst1->u[2] = src0->u[2] * src1->u[2]; + dst1->u[3] = src0->u[3] * src1->u[3]; + dst0->u[0] = 0; + dst0->u[1] = 0; + dst0->u[2] = 0; + dst0->u[3] = 0; + } + + static void + micro_movc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2 ) + { + dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0]; + dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1]; + dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2]; + dst->u[3] = src0->u[3] ? 
src1->u[3] : src2->u[3]; + } + + static void + micro_neg( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = -src->f[0]; + dst->f[1] = -src->f[1]; + dst->f[2] = -src->f[2]; + dst->f[3] = -src->f[3]; + } + + static void + micro_ineg( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->i[0] = -src->i[0]; + dst->i[1] = -src->i[1]; + dst->i[2] = -src->i[2]; + dst->i[3] = -src->i[3]; + } + + static void + micro_not( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->u[0] = ~src->u[0]; + dst->u[1] = ~src->u[1]; + dst->u[2] = ~src->u[2]; + dst->u[3] = ~src->u[3]; + } + + static void + micro_or( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] | src1->u[0]; + dst->u[1] = src0->u[1] | src1->u[1]; + dst->u[2] = src0->u[2] | src1->u[2]; + dst->u[3] = src0->u[3] | src1->u[3]; + } + + static void + micro_pow( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] ); + dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] ); + dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] ); + dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] ); + } + + static void + micro_rnd( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) ); + dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) ); + dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) ); + dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) ); + } + + static void + micro_shl( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] << src1->i[0]; + dst->i[1] = src0->i[1] << src1->i[1]; + dst->i[2] = src0->i[2] << src1->i[2]; + dst->i[3] = src0->i[3] << src1->i[3]; + } + + static void + micro_ishr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->i[0] = src0->i[0] >> src1->i[0]; + dst->i[1] = src0->i[1] >> src1->i[1]; + dst->i[2] = src0->i[2] >> src1->i[2]; + dst->i[3] = src0->i[3] >> src1->i[3]; + } + + static void + micro_trunc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0 ) + { + dst->f[0] = (float) (int) src0->f[0]; + dst->f[1] = (float) (int) src0->f[1]; + dst->f[2] = (float) (int) src0->f[2]; + dst->f[3] = (float) (int) src0->f[3]; + } + + static void + micro_ushr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] >> src1->u[0]; + dst->u[1] = src0->u[1] >> src1->u[1]; + dst->u[2] = src0->u[2] >> src1->u[2]; + dst->u[3] = src0->u[3] >> src1->u[3]; + } + + static void + micro_sin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) sin( (double) src->f[0] ); + dst->f[1] = (float) sin( (double) src->f[1] ); + dst->f[2] = (float) sin( (double) src->f[2] ); + dst->f[3] = (float) sin( (double) src->f[3] ); + } + + static void + micro_sqrt( union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) sqrt( (double) src->f[0] ); + dst->f[1] = (float) sqrt( (double) src->f[1] ); + dst->f[2] = (float) sqrt( (double) src->f[2] ); + dst->f[3] = (float) sqrt( (double) src->f[3] ); + 
} + + static void + micro_sub( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->f[0] = src0->f[0] - src1->f[0]; + dst->f[1] = src0->f[1] - src1->f[1]; + dst->f[2] = src0->f[2] - src1->f[2]; + dst->f[3] = src0->f[3] - src1->f[3]; + } + + static void + micro_u2f( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) + { + dst->f[0] = (float) src->u[0]; + dst->f[1] = (float) src->u[1]; + dst->f[2] = (float) src->u[2]; + dst->f[3] = (float) src->u[3]; + } + + static void + micro_xor( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) + { + dst->u[0] = src0->u[0] ^ src1->u[0]; + dst->u[1] = src0->u[1] ^ src1->u[1]; + dst->u[2] = src0->u[2] ^ src1->u[2]; + dst->u[3] = src0->u[3] ^ src1->u[3]; + } + + static void + fetch_src_file_channel( + const struct tgsi_exec_machine *mach, + const uint file, + const uint swizzle, + const union tgsi_exec_channel *index, + union tgsi_exec_channel *chan ) + { + switch( swizzle ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch( file ) { + case TGSI_FILE_CONSTANT: + chan->f[0] = mach->Consts[index->i[0]][swizzle]; + chan->f[1] = mach->Consts[index->i[1]][swizzle]; + chan->f[2] = mach->Consts[index->i[2]][swizzle]; + chan->f[3] = mach->Consts[index->i[3]][swizzle]; + break; + + case TGSI_FILE_INPUT: + chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_TEMPORARY: + chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_IMMEDIATE: + assert( index->i[0] < (int) mach->ImmLimit ); + chan->f[0] = mach->Imms[index->i[0]][swizzle]; + assert( index->i[1] < (int) mach->ImmLimit ); + chan->f[1] = mach->Imms[index->i[1]][swizzle]; + assert( index->i[2] < (int) mach->ImmLimit ); + chan->f[2] = mach->Imms[index->i[2]][swizzle]; + assert( index->i[3] < (int) mach->ImmLimit ); + chan->f[3] = mach->Imms[index->i[3]][swizzle]; + break; + + case TGSI_FILE_ADDRESS: + chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_OUTPUT: + /* vertex/fragment output vars can be read too */ + chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + default: + assert( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]; + break; + + case TGSI_EXTSWIZZLE_ONE: + *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]; + break; + + default: + assert( 0 ); + } + } + + static void + fetch_source( + const struct tgsi_exec_machine *mach, + union tgsi_exec_channel *chan, + const struct tgsi_full_src_register *reg, + const uint chan_index ) + { + union tgsi_exec_channel index; + uint swizzle; + + index.i[0] = + index.i[1] = + 
index.i[2] = + index.i[3] = reg->SrcRegister.Index; + + if (reg->SrcRegister.Indirect) { + union tgsi_exec_channel index2; + union tgsi_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterInd.File, + swizzle, + &index2, + &indir_index ); + + index.i[0] += indir_index.i[0]; + index.i[1] += indir_index.i[1]; + index.i[2] += indir_index.i[2]; + index.i[3] += indir_index.i[3]; + } + + if( reg->SrcRegister.Dimension ) { + switch( reg->SrcRegister.File ) { + case TGSI_FILE_INPUT: + index.i[0] *= 17; + index.i[1] *= 17; + index.i[2] *= 17; + index.i[3] *= 17; + break; + case TGSI_FILE_CONSTANT: + index.i[0] *= 4096; + index.i[1] *= 4096; + index.i[2] *= 4096; + index.i[3] *= 4096; + break; + default: + assert( 0 ); + } + + index.i[0] += reg->SrcRegisterDim.Index; + index.i[1] += reg->SrcRegisterDim.Index; + index.i[2] += reg->SrcRegisterDim.Index; + index.i[3] += reg->SrcRegisterDim.Index; + + if (reg->SrcRegisterDim.Indirect) { + union tgsi_exec_channel index2; + union tgsi_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterDimInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterDimInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterDimInd.File, + swizzle, + &index2, + &indir_index ); + + index.i[0] += indir_index.i[0]; + index.i[1] += indir_index.i[1]; + index.i[2] += indir_index.i[2]; + index.i[3] += indir_index.i[3]; + } + } + + swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + fetch_src_file_channel( + mach, + reg->SrcRegister.File, + swizzle, + &index, + chan ); + + switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { + case TGSI_UTIL_SIGN_CLEAR: + micro_abs( chan, chan ); + break; + + case TGSI_UTIL_SIGN_SET: + micro_abs( chan, chan ); + micro_neg( chan, chan ); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + micro_neg( chan, chan ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } + + if (reg->SrcRegisterExtMod.Complement) { + micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan ); + } + } + + static void + store_dest( + struct tgsi_exec_machine *mach, + const union tgsi_exec_channel *chan, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + uint chan_index ) + { + union tgsi_exec_channel *dst; + + switch( reg->DstRegister.File ) { + case TGSI_FILE_NULL: + return; + + case TGSI_FILE_OUTPUT: + dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_TEMPORARY: + dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_ADDRESS: + dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + break; + + default: + assert( 0 ); + return; + } + + switch (inst->Instruction.Saturate) + { + case TGSI_SAT_NONE: + if (mach->ExecMask & 0x1) + dst->i[0] = chan->i[0]; + if (mach->ExecMask & 0x2) + dst->i[1] = chan->i[1]; + if (mach->ExecMask & 0x4) + dst->i[2] = chan->i[2]; + if (mach->ExecMask & 0x8) + dst->i[3] = chan->i[3]; + break; + + case TGSI_SAT_ZERO_ONE: + /* XXX need to obey ExecMask here */ + micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]); + micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]); + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + assert( 0 ); + break; + + default: + 
+       assert( 0 );
+    }
+ }
+ 
+ #define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+ 
+ #define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+ 
+ 
+ /**
+  * Execute ARB-style KIL which is predicated by a src register.
+  * Kill fragment if any of the four values is less than zero.
+  */
+ static void
+ exec_kilp(struct tgsi_exec_machine *mach,
+           const struct tgsi_full_instruction *inst)
+ {
+    uint uniquemask;
+    uint chan_index;
+    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+    union tgsi_exec_channel r[1];
+ 
+    /* This mask stores component bits that were already tested. Note that
+     * we test if the value is less than zero, so 1.0 and 0.0 need not be
+     * tested. */
+    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+ 
+    for (chan_index = 0; chan_index < 4; chan_index++)
+    {
+       uint swizzle;
+       uint i;
+ 
+       /* unswizzle channel */
+       swizzle = tgsi_util_get_full_src_register_extswizzle (
+          &inst->FullSrcRegisters[0],
+          chan_index);
+ 
+       /* check if the component has not been already tested */
+       if (uniquemask & (1 << swizzle))
+          continue;
+       uniquemask |= 1 << swizzle;
+ 
+       FETCH(&r[0], 0, chan_index);
+       for (i = 0; i < 4; i++)
+          if (r[0].f[i] < 0.0f)
+             kilmask |= 1 << i;
+    }
+ 
+    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+ }
+ 
+ 
+ /*
+  * Fetch a texel using STR texture coordinates.
+  */
+ static void
+ fetch_texel( struct tgsi_sampler *sampler,
+              const union tgsi_exec_channel *s,
+              const union tgsi_exec_channel *t,
+              const union tgsi_exec_channel *p,
+              float lodbias,  /* XXX should be float[4] */
+              union tgsi_exec_channel *r,
+              union tgsi_exec_channel *g,
+              union tgsi_exec_channel *b,
+              union tgsi_exec_channel *a )
+ {
+    uint j;
+    float rgba[NUM_CHANNELS][QUAD_SIZE];
+ 
+    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+ 
+    for (j = 0; j < 4; j++) {
+       r->f[j] = rgba[0][j];
+       g->f[j] = rgba[1][j];
+       b->f[j] = rgba[2][j];
+       a->f[j] = rgba[3][j];
+    }
+ }
+ 
+ 
+ static void
+ exec_tex(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst,
+          boolean biasLod)
+ {
+    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+    union tgsi_exec_channel r[8];
+    uint chan_index;
+    float lodBias;
+ 
+    /* debug_printf("Sampler %u unit %u\n", sampler, unit); */
+ 
+    switch (inst->InstructionExtTexture.Texture) {
+    case TGSI_TEXTURE_1D:
+ 
+       FETCH(&r[0], 0, CHAN_X);
+ 
+       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+       case TGSI_EXTSWIZZLE_W:
+          FETCH(&r[1], 0, CHAN_W);
+          micro_div( &r[0], &r[0], &r[1] );
+          break;
+ 
+       case TGSI_EXTSWIZZLE_ONE:
+          break;
+ 
+       default:
+          assert (0);
+       }
+ 
+       if (biasLod) {
+          FETCH(&r[1], 0, CHAN_W);
+          lodBias = r[1].f[0];
+       }
+       else
+          lodBias = 0.0;
+ 
+       fetch_texel(&mach->Samplers[unit],
+                   &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                   &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+       break;
+ 
+    case TGSI_TEXTURE_2D:
+    case TGSI_TEXTURE_RECT:
+ 
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 0, CHAN_Z);
+ 
+       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+       case TGSI_EXTSWIZZLE_W:
+          FETCH(&r[3], 0, CHAN_W);
+          micro_div( &r[0], &r[0], &r[3] );
+          micro_div( &r[1], &r[1], &r[3] );
+          micro_div( &r[2], &r[2], &r[3] );
+          break;
+ 
+       case TGSI_EXTSWIZZLE_ONE:
+          break;
+ 
+       default:
+          assert (0);
+       }
+ 
+       if (biasLod) {
+          FETCH(&r[3], 0, CHAN_W);
+          lodBias = r[3].f[0];
+       }
+       else
+          lodBias = 0.0;
+ 
+       fetch_texel(&mach->Samplers[unit],
+                   &r[0], &r[1], &r[2], lodBias,
/* inputs */ + &r[0], &r[1], &r[2], &r[3]); /* outputs */ + break; + + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) { + case TGSI_EXTSWIZZLE_W: + FETCH(&r[3], 0, CHAN_W); + micro_div( &r[0], &r[0], &r[3] ); + micro_div( &r[1], &r[1], &r[3] ); + micro_div( &r[2], &r[2], &r[3] ); + break; + + case TGSI_EXTSWIZZLE_ONE: + break; + + default: + assert (0); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, + &r[0], &r[1], &r[2], &r[3]); + break; + + default: + assert (0); + } + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[chan_index], 0, chan_index ); + } + } + + + /** + * Evaluate a constant-valued coefficient at the position of the + * current quad. + */ + static void + eval_constant_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) + { + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan]; + } + } + + /** + * Evaluate a linear-valued coefficient at the position of the + * current quad. + */ + static void + eval_linear_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) + { + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + mach->Inputs[attrib].xyzw[chan].f[0] = a0; + mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; + mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; + mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; + } + + /** + * Evaluate a perspective-valued coefficient at the position of the + * current quad. 
+ */ + static void + eval_perspective_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) + { + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + const float *w = mach->QuadPos.xyzw[3].f; + /* divide by W here */ + mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; + mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; + mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; + mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; + } + + + typedef void (* eval_coef_func)( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ); + + static void + exec_declaration( + struct tgsi_exec_machine *mach, + const struct tgsi_full_declaration *decl ) + { + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + eval_coef_func eval; + + assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE ); + + first = decl->u.DeclarationRange.First; + last = decl->u.DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + switch( decl->Interpolation.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + eval = eval_constant_coef; + break; + + case TGSI_INTERPOLATE_LINEAR: + eval = eval_linear_coef; + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + eval = eval_perspective_coef; + break; + + default: + assert( 0 ); + } + + if( mask == TGSI_WRITEMASK_XYZW ) { + unsigned i, j; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + eval( mach, i, j ); + } + } + } + else { + unsigned i, j; + + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + for( i = first; i <= last; i++ ) { + eval( mach, i, j ); + } + } + } + } + } + } + } + + static void + exec_instruction( + struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst, + int *pc ) + { + uint chan_index; + union tgsi_exec_channel r[8]; + + (*pc)++; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_f2it( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + /* TGSI_OPCODE_SWZ */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_X ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[1], 0, CHAN_Y ); + micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + + FETCH( &r[2], 0, CHAN_W ); + micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] ); + micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] ); + micro_pow( &r[1], &r[1], &r[2] ); + micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Z ); + } + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); 
+ } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( &r[0], 0, CHAN_X ); + micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( &r[0], 0, CHAN_X ); + micro_sqrt( &r[0], &r[0] ); + micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + assert (0); + break; + + case TGSI_OPCODE_LOG: + assert (0); + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) + { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + micro_mul( &r[0], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_add( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Z ); + FETCH( &r[2], 1, CHAN_Z ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_W); + FETCH(&r[2], 1, CHAN_W); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + FETCH( &r[0], 0, CHAN_Y ); + FETCH( &r[1], 1, CHAN_Y); + micro_mul( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_Z ); + STORE( &r[0], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + FETCH( &r[0], 1, CHAN_W ); + STORE( &r[0], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + /* XXX use micro_min()?? */ + micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + /* XXX use micro_max()?? 
*/ + micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] ); + + STORE(&r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_mul( &r[0], &r[0], &r[1] ); + FETCH( &r[1], 2, chan_index ); + micro_add( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + micro_sub( &r[0], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + micro_sub( &r[1], &r[1], &r[2] ); + micro_mul( &r[0], &r[0], &r[1] ); + micro_add( &r[0], &r[0], &r[2] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_CND: + assert (0); + break; + + case TGSI_OPCODE_CND0: + assert (0); + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + assert (0); + break; + + case TGSI_OPCODE_INDEX: + assert (0); + break; + + case TGSI_OPCODE_NEGATE: + assert (0); + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_frc( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + assert (0); + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_flr( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_rnd( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH(&r[0], 0, CHAN_X); + + micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( &r[0], 0, CHAN_X ); + micro_lg2( &r[0], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_pow( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + FETCH(&r[0], 0, CHAN_Y); + FETCH(&r[1], 1, CHAN_Z); + + micro_mul( &r[2], &r[0], &r[1] ); + + FETCH(&r[3], 0, CHAN_Z); + FETCH(&r[4], 1, CHAN_Y); + + micro_mul( &r[5], &r[3], &r[4] ); + micro_sub( &r[2], &r[2], 
&r[5] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &r[2], 0, CHAN_X ); + } + + FETCH(&r[2], 1, CHAN_X); + + micro_mul( &r[3], &r[3], &r[2] ); + + FETCH(&r[5], 0, CHAN_X); + + micro_mul( &r[1], &r[1], &r[5] ); + micro_sub( &r[3], &r[3], &r[1] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + STORE( &r[3], 0, CHAN_Y ); + } + + micro_mul( &r[5], &r[5], &r[4] ); + micro_mul( &r[0], &r[0], &r[2] ); + micro_sub( &r[5], &r[5], &r[0] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( &r[5], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + assert (0); + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + + micro_abs( &r[0], &r[0] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_RCC: + assert (0); + break; + + case TGSI_OPCODE_DPH: + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 1, CHAN_W); + + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH(&r[0], 0, CHAN_X); + + micro_cos( &r[0], &r[0] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ddx( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDY: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ddy( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_KILP: + exec_kilp (mach, inst); + break; + + case TGSI_OPCODE_KIL: + /* for enabled ExecMask bits, set the killed bit */ + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask; + break; + + case TGSI_OPCODE_PK2H: + assert (0); + break; + + case TGSI_OPCODE_PK2US: + assert (0); + break; + + case TGSI_OPCODE_PK4B: + assert (0); + break; + + case TGSI_OPCODE_PK4UB: + assert (0); + break; + + case TGSI_OPCODE_RFL: + assert (0); + break; + + case TGSI_OPCODE_SEQ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_eq( &r[0], &r[0], &r[1], + &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], + &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SFL: + assert (0); + break; + + case TGSI_OPCODE_SGT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SIN: + FETCH( &r[0], 0, CHAN_X ); + micro_sin( &r[0], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ge( &r[0], &r[0], &r[1], 
&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+ STORE( &r[0], 0, chan_index );
+ }
+ break;
+
+ case TGSI_OPCODE_SNE:
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH( &r[0], 0, chan_index );
+ FETCH( &r[1], 1, chan_index );
+ micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+ STORE( &r[0], 0, chan_index );
+ }
+ break;
+
+ case TGSI_OPCODE_STR:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_TEX:
+ /* simple texture lookup */
+ /* src[0] = texcoord */
+ /* src[1] = sampler unit */
+ exec_tex(mach, inst, FALSE);
+ break;
+
+ case TGSI_OPCODE_TXB:
+ /* Texture lookup with lod bias */
+ /* src[0] = texcoord (src[0].w = LOD bias) */
+ /* src[1] = sampler unit */
+ exec_tex(mach, inst, TRUE);
+ break;
+
+ case TGSI_OPCODE_TXD:
+ /* Texture lookup with explicit partial derivatives */
+ /* src[0] = texcoord */
+ /* src[1] = d[strq]/dx */
+ /* src[2] = d[strq]/dy */
+ /* src[3] = sampler unit */
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_TXL:
+ /* Texture lookup with explicit LOD */
+ /* src[0] = texcoord (src[0].w = LOD) */
+ /* src[1] = sampler unit */
+ exec_tex(mach, inst, TRUE);
+ break;
+
+ case TGSI_OPCODE_UP2H:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_UP2US:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_UP4B:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_UP4UB:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_X2D:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_ARA:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_ARR:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_BRA:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_CAL:
+ /* skip the call if no execution channels are enabled */
+ if (mach->ExecMask) {
+ /* do the call */
+
+ /* push the Cond, Loop, Cont stacks */
+ assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+ mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+ assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+ mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+ assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+ mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+ assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+ mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+ /* note that PC was already incremented above */
+ mach->CallStack[mach->CallStackTop++] = *pc;
+ *pc = inst->InstructionExtLabel.Label;
+ }
+ break;
+
+ case TGSI_OPCODE_RET:
+ mach->FuncMask &= ~mach->ExecMask;
+ UPDATE_EXEC_MASK(mach);
+
+ if (mach->ExecMask == 0x0) {
+ /* really return now (otherwise, keep executing) */
+
+ if (mach->CallStackTop == 0) {
+ /* returning from main() */
+ *pc = -1;
+ return;
+ }
+ *pc = mach->CallStack[--mach->CallStackTop];
+
+ /* pop the Cond, Loop, Cont stacks */
+ assert(mach->CondStackTop > 0);
+ mach->CondMask = mach->CondStack[--mach->CondStackTop];
+ assert(mach->LoopStackTop > 0);
+ mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+ assert(mach->ContStackTop > 0);
+ mach->ContMask = mach->ContStack[--mach->ContStackTop];
+ assert(mach->FuncStackTop > 0);
+ mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+ UPDATE_EXEC_MASK(mach);
+ }
+ break;
+
+ case TGSI_OPCODE_SSG:
+ assert (0);
+ break;
+
+ case TGSI_OPCODE_CMP:
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH(&r[0], 0, chan_index);
+ FETCH(&r[1], 1, chan_index);
+ FETCH(&r[2], 2, chan_index);
+
+ micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+
+ STORE(&r[0], 0, chan_index);
+ }
+ break;
+
+ case TGSI_OPCODE_SCS:
+ if(
IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( &r[0], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + micro_cos( &r[1], &r[0] ); + STORE( &r[1], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + micro_sin( &r[1], &r[0] ); + STORE( &r[1], 0, CHAN_Y ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_NRM: + assert (0); + break; + + case TGSI_OPCODE_DIV: + assert( 0 ); + break; + + case TGSI_OPCODE_DP2: + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_IF: + /* push CondMask */ + assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + FETCH( &r[0], 0, CHAN_X ); + /* update CondMask */ + if( ! r[0].u[0] ) { + mach->CondMask &= ~0x1; + } + if( ! r[0].u[1] ) { + mach->CondMask &= ~0x2; + } + if( ! r[0].u[2] ) { + mach->CondMask &= ~0x4; + } + if( ! r[0].u[3] ) { + mach->CondMask &= ~0x8; + } + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ELSE */ + break; + + case TGSI_OPCODE_ELSE: + /* invert CondMask wrt previous mask */ + { + uint prevMask; + assert(mach->CondStackTop > 0); + prevMask = mach->CondStack[mach->CondStackTop - 1]; + mach->CondMask = ~mach->CondMask & prevMask; + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ENDIF */ + } + break; + + case TGSI_OPCODE_ENDIF: + /* pop CondMask */ + assert(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_END: + /* halt execution */ + *pc = -1; + break; + + case TGSI_OPCODE_REP: + assert (0); + break; + + case TGSI_OPCODE_ENDREP: + assert (0); + break; + + case TGSI_OPCODE_PUSHA: + assert (0); + break; + + case TGSI_OPCODE_POPA: + assert (0); + break; + + case TGSI_OPCODE_CEIL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ceil( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_I2F: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_i2f( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_NOT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_not( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_TRUNC: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_trunc( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_shl( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ishr( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_AND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, 
chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_and( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_OR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_or( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOD: + assert (0); + break; + + case TGSI_OPCODE_XOR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_xor( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SAD: + assert (0); + break; + + case TGSI_OPCODE_TXF: + assert (0); + break; + + case TGSI_OPCODE_TXQ: + assert (0); + break; + + case TGSI_OPCODE_EMIT: + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; + break; + + case TGSI_OPCODE_ENDPRIM: + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; + break; + + case TGSI_OPCODE_LOOP: + /* fall-through (for now) */ + case TGSI_OPCODE_BGNLOOP2: + /* push LoopMask and ContMasks */ + assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + break; + + case TGSI_OPCODE_ENDLOOP: + /* fall-through (for now at least) */ + case TGSI_OPCODE_ENDLOOP2: + /* Restore ContMask, but don't pop */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; + if (mach->LoopMask) { + /* repeat loop: jump to instruction just past BGNLOOP */ + *pc = inst->InstructionExtLabel.Label + 1; + } + else { + /* exit loop: pop LoopMask */ + assert(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + /* pop ContMask */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + } + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BRK: + /* turn off loop channels for each enabled exec channel */ + mach->LoopMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_CONT: + /* turn off cont channels for each enabled exec channel */ + mach->ContMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BGNSUB: + /* no-op */ + break; + + case TGSI_OPCODE_ENDSUB: + /* no-op */ + break; + + case TGSI_OPCODE_NOISE1: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE2: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE3: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE4: + assert( 0 ); + break; + + case TGSI_OPCODE_NOP: + break; + + default: + assert( 0 ); + } + } + + + /** + * Run TGSI interpreter. 
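+ * All execution masks start fully enabled (0xf, one bit per quad
+ * component) and instructions execute until the program counter is set
+ * to -1 by END (or by a RET from main).
+ *
+ * Illustrative usage, a sketch only ('tokens', 'num_samplers' and
+ * 'samplers' are placeholders supplied by the caller):
+ *
+ * tgsi_exec_machine_init( &machine );
+ * tgsi_exec_machine_bind_shader( &machine, tokens, num_samplers, samplers );
+ * live_mask = tgsi_exec_machine_run( &machine );
+ *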
+ * \return bitmask of "alive" quad components + */ + uint + tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) + { + uint i; + int pc = 0; + + mach->CondMask = 0xf; + mach->LoopMask = 0xf; + mach->ContMask = 0xf; + mach->FuncMask = 0xf; + mach->ExecMask = 0xf; + + mach->CondStackTop = 0; /* temporarily subvert this assertion */ + assert(mach->CondStackTop == 0); + assert(mach->LoopStackTop == 0); + assert(mach->ContStackTop == 0); + assert(mach->CallStackTop == 0); + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; + + if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; + mach->Primitives[0] = 0; + } + + + /* execute declarations (interpolants) */ + for (i = 0; i < mach->NumDeclarations; i++) { + exec_declaration( mach, mach->Declarations+i ); + } + + /* execute instructions, until pc is set to -1 */ + while (pc != -1) { + assert(pc < mach->NumInstructions); + exec_instruction( mach, mach->Instructions + pc, &pc ); + } + + #if 0 + /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ + if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { + /* + * Scale back depth component. + */ + for (i = 0; i < 4; i++) + mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; + } + #endif + + return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + } + + diff --cc src/gallium/auxiliary/tgsi/exec/tgsi_exec.h index 00000000000,1fb66ee960f..45c49dd007c mode 000000,100644..100644 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h @@@ -1,0 -1,239 +1,243 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + #if !defined TGSI_EXEC_H + #define TGSI_EXEC_H + + #include "pipe/p_compiler.h" + + #if defined __cplusplus + extern "C" { + #endif + + #define NUM_CHANNELS 4 /* R,G,B,A */ + #define QUAD_SIZE 4 /* 4 pixel/quad */ + + /** + * Registers may be treated as float, signed int or unsigned int. 
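+ * (This union makes the type punning explicit: a channel written as
+ * chan.f[i] by an arithmetic opcode can be reread as chan.u[i] by a
+ * bitwise opcode such as AND/OR/XOR without any conversion.)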
+ */ + union tgsi_exec_channel + { + float f[QUAD_SIZE]; + int i[QUAD_SIZE]; + unsigned u[QUAD_SIZE]; + }; + + /** + * A vector[RGBA] of channels[4 pixels] + */ + struct tgsi_exec_vector + { + union tgsi_exec_channel xyzw[NUM_CHANNELS]; + }; + + /** + * For fragment programs, information for computing fragment input + * values from plane equation of the triangle/line. + */ + struct tgsi_interp_coef + { + float a0[NUM_CHANNELS]; /* in an xyzw layout */ + float dadx[NUM_CHANNELS]; + float dady[NUM_CHANNELS]; + }; + + + struct softpipe_tile_cache; /**< Opaque to TGSI */ + + /** + * Information for sampling textures, which must be implemented + * by code outside the TGSI executor. + */ + struct tgsi_sampler + { + const struct pipe_sampler_state *state; + struct pipe_texture *texture; + /** Get samples for four fragments in a quad */ + void (*get_samples)(struct tgsi_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); + void *pipe; /*XXX temporary*/ + struct softpipe_tile_cache *cache; + }; + + /** + * For branching/calling subroutines. + */ + struct tgsi_exec_labels + { + unsigned labels[128][2]; + unsigned count; + }; + + /* + * Locations of various utility registers (_I = Index, _C = Channel) + */ + #define TGSI_EXEC_TEMP_00000000_I 32 + #define TGSI_EXEC_TEMP_00000000_C 0 + + #define TGSI_EXEC_TEMP_7FFFFFFF_I 32 + #define TGSI_EXEC_TEMP_7FFFFFFF_C 1 + + #define TGSI_EXEC_TEMP_80000000_I 32 + #define TGSI_EXEC_TEMP_80000000_C 2 + + #define TGSI_EXEC_TEMP_FFFFFFFF_I 32 + #define TGSI_EXEC_TEMP_FFFFFFFF_C 3 + + #define TGSI_EXEC_TEMP_ONE_I 33 + #define TGSI_EXEC_TEMP_ONE_C 0 + + #define TGSI_EXEC_TEMP_TWO_I 33 + #define TGSI_EXEC_TEMP_TWO_C 1 + + #define TGSI_EXEC_TEMP_128_I 33 + #define TGSI_EXEC_TEMP_128_C 2 + + #define TGSI_EXEC_TEMP_MINUS_128_I 33 + #define TGSI_EXEC_TEMP_MINUS_128_C 3 + + #define TGSI_EXEC_TEMP_KILMASK_I 34 + #define TGSI_EXEC_TEMP_KILMASK_C 0 + + #define TGSI_EXEC_TEMP_OUTPUT_I 34 + #define TGSI_EXEC_TEMP_OUTPUT_C 1 + + #define TGSI_EXEC_TEMP_PRIMITIVE_I 34 + #define TGSI_EXEC_TEMP_PRIMITIVE_C 2 + + #define TGSI_EXEC_TEMP_R0 35 + + #define TGSI_EXEC_NUM_TEMPS (32 + 4) + #define TGSI_EXEC_NUM_ADDRS 1 + #define TGSI_EXEC_NUM_IMMEDIATES 256 + + #define TGSI_EXEC_MAX_COND_NESTING 10 + #define TGSI_EXEC_MAX_LOOP_NESTING 10 + #define TGSI_EXEC_MAX_CALL_NESTING 10 + + /** + * Run-time virtual machine state for executing TGSI shader. + */ + struct tgsi_exec_machine + { + /* + * 32 program temporaries + * 4 internal temporaries + * 1 address + * 1 temporary of padding to align to 16 bytes + */ + struct tgsi_exec_vector _Temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_ADDRS + 1]; + + /* + * This will point to _Temps after aligning to 16B boundary. + */ + struct tgsi_exec_vector *Temps; + struct tgsi_exec_vector *Addrs; + + struct tgsi_sampler *Samplers; + + float Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; + unsigned ImmLimit; + float (*Consts)[4]; + struct tgsi_exec_vector *Inputs; + struct tgsi_exec_vector *Outputs; + const struct tgsi_token *Tokens; + unsigned Processor; + + /* GEOMETRY processor only. */ + unsigned *Primitives; + + /* FRAGMENT processor only. 
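+ * InterpCoefs drives input interpolation; for each input channel the
+ * value is roughly a0 + dadx * x + dady * y, divided by w for
+ * perspective-correct attributes.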
*/ + const struct tgsi_interp_coef *InterpCoefs; + struct tgsi_exec_vector QuadPos; + + /* Conditional execution masks */ + uint CondMask; /**< For IF/ELSE/ENDIF */ + uint LoopMask; /**< For BGNLOOP/ENDLOOP */ + uint ContMask; /**< For loop CONT statements */ + uint FuncMask; /**< For function calls */ + uint ExecMask; /**< = CondMask & LoopMask */ + + /** Condition mask stack (for nested conditionals) */ + uint CondStack[TGSI_EXEC_MAX_COND_NESTING]; + int CondStackTop; + + /** Loop mask stack (for nested loops) */ + uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int LoopStackTop; + + /** Loop continue mask stack (see comments in tgsi_exec.c) */ + uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int ContStackTop; + + /** Function execution mask stack (for executing subroutine code) */ + uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING]; + int FuncStackTop; + + /** Function call stack for saving/restoring the program counter */ + uint CallStack[TGSI_EXEC_MAX_CALL_NESTING]; + int CallStackTop; + + struct tgsi_full_instruction *Instructions; + uint NumInstructions; + + struct tgsi_full_declaration *Declarations; + uint NumDeclarations; + + struct tgsi_exec_labels Labels; + }; + - + void + tgsi_exec_machine_init( ++ struct tgsi_exec_machine *mach ); ++ ++ ++void ++tgsi_exec_machine_bind_shader( + struct tgsi_exec_machine *mach, + const struct tgsi_token *tokens, - unsigned numSamplers, ++ uint numSamplers, + struct tgsi_sampler *samplers); + + uint + tgsi_exec_machine_run( + struct tgsi_exec_machine *mach ); + + + void + tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach); + + + #if defined __cplusplus + } /* extern "C" */ + #endif + + #endif /* TGSI_EXEC_H */ diff --cc src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c index 00000000000,593464db3ed..79209575bc4 mode 000000,100755..100755 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c @@@ -1,0 -1,2378 +1,2377 @@@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + #include "pipe/p_util.h" + #include "pipe/p_shader_tokens.h" + #include "tgsi/util/tgsi_parse.h" + #include "tgsi/util/tgsi_util.h" + #include "tgsi_exec.h" + #include "tgsi_sse2.h" + + #include "x86/rtasm/x86sse.h" + + #if defined(__i386__) || defined(__386__) + + #define DUMP_SSE 0 + + #if DUMP_SSE + + static void + _print_reg( + struct x86_reg reg ) + { + if (reg.mod != mod_REG) + debug_printf( "[" ); + + switch( reg.file ) { + case file_REG32: + switch( reg.idx ) { + case reg_AX: + debug_printf( "EAX" ); + break; + case reg_CX: + debug_printf( "ECX" ); + break; + case reg_DX: + debug_printf( "EDX" ); + break; + case reg_BX: + debug_printf( "EBX" ); + break; + case reg_SP: + debug_printf( "ESP" ); + break; + case reg_BP: + debug_printf( "EBP" ); + break; + case reg_SI: + debug_printf( "ESI" ); + break; + case reg_DI: + debug_printf( "EDI" ); + break; + } + break; + case file_MMX: + assert( 0 ); + break; + case file_XMM: + debug_printf( "XMM%u", reg.idx ); + break; + case file_x87: + assert( 0 ); + break; + } + + if (reg.mod == mod_DISP8 || + reg.mod == mod_DISP32) + debug_printf("+%d", reg.disp); + + if (reg.mod != mod_REG) + debug_printf( "]" ); + } + + static void + _fill( + const char *op ) + { + unsigned count = 10 - strlen( op ); + + while( count-- ) { + debug_printf( " " ); + } + } + + #define DUMP_START() debug_printf( "\nsse-dump start ----------------" ) + #define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" ) + #define DUMP( OP ) debug_printf( "\n%s", OP ) + #define DUMP_I( OP, I ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + debug_printf( "%u", I ); } while( 0 ) + #define DUMP_R( OP, R0 ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + _print_reg( R0 ); } while( 0 ) + #define DUMP_RR( OP, R0, R1 ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + _print_reg( R0 );\ + debug_printf( ", " );\ + _print_reg( R1 ); } while( 0 ) + #define DUMP_RRI( OP, R0, R1, I ) do {\ + debug_printf( "\n%s", OP );\ + _fill( OP );\ + _print_reg( R0 );\ + debug_printf( ", " );\ + _print_reg( R1 );\ + debug_printf( ", " );\ + debug_printf( "%u", I ); } while( 0 ) + + #else + + #define DUMP_START() + #define DUMP_END() + #define DUMP( OP ) + #define DUMP_I( OP, I ) + #define DUMP_R( OP, R0 ) + #define DUMP_RR( OP, R0, R1 ) + #define DUMP_RRI( OP, R0, R1, I ) + + #endif + + #define FOR_EACH_CHANNEL( CHAN )\ + for( CHAN = 0; CHAN < 4; CHAN++ ) + + #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + + #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + if( IS_DST0_CHANNEL_ENABLED( INST, CHAN )) + + #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\ + FOR_EACH_CHANNEL( CHAN )\ + IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) + + #define CHAN_X 0 + #define CHAN_Y 1 + #define CHAN_Z 2 + #define CHAN_W 3 + + #define TEMP_R0 TGSI_EXEC_TEMP_R0 + + /** + * X86 utility functions. + */ + + static struct x86_reg + make_xmm( + unsigned xmm ) + { + return x86_make_reg( + file_XMM, + (enum x86_reg_name) xmm ); + } + + /** + * X86 register mapping helpers. 
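+ * The helpers below hard-code the generated function's calling
+ * convention: ECX holds the constant base, EAX the input base, EDX the
+ * output base, and EBX (WIN32) or ESI (elsewhere) the temp base.
+ * Coefficients share the output base (see get_coef_base()), which is
+ * one reason declarations and instructions are emitted in separate
+ * phases.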
+ */ + + static struct x86_reg + get_const_base( void ) + { + return x86_make_reg( + file_REG32, + reg_CX ); + } + + static struct x86_reg + get_input_base( void ) + { + return x86_make_reg( + file_REG32, + reg_AX ); + } + + static struct x86_reg + get_output_base( void ) + { + return x86_make_reg( + file_REG32, + reg_DX ); + } + + static struct x86_reg + get_temp_base( void ) + { + #ifdef WIN32 + return x86_make_reg( + file_REG32, + reg_BX ); + #else + return x86_make_reg( + file_REG32, + reg_SI ); + #endif + } + + static struct x86_reg + get_coef_base( void ) + { + return get_output_base(); + } + + /** + * Data access helpers. + */ + + static struct x86_reg + get_argument( + unsigned index ) + { + return x86_make_disp( + x86_make_reg( file_REG32, reg_SP ), + (index + 1) * 4 ); + } + + static struct x86_reg + get_const( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_const_base(), + (vec * 4 + chan) * 4 ); + } + + static struct x86_reg + get_input( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_input_base(), + (vec * 4 + chan) * 16 ); + } + + static struct x86_reg + get_output( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_output_base(), + (vec * 4 + chan) * 16 ); + } + + static struct x86_reg + get_temp( + unsigned vec, + unsigned chan ) + { + return x86_make_disp( + get_temp_base(), + (vec * 4 + chan) * 16 ); + } + + static struct x86_reg + get_coef( + unsigned vec, + unsigned chan, + unsigned member ) + { + return x86_make_disp( + get_coef_base(), + ((vec * 3 + member) * 4 + chan) * 4 ); + } + + /** + * X86 rtasm wrappers. + */ + + static void + emit_addps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ADDPS", dst, src ); + sse_addps( func, dst, src ); + } + + static void + emit_andnps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ANDNPS", dst, src ); + sse_andnps( func, dst, src ); + } + + static void + emit_andps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ANDPS", dst, src ); + sse_andps( func, dst, src ); + } + + static void + emit_call( + struct x86_function *func, + void (* addr)() ) + { + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + + DUMP_I( "CALL", addr ); + x86_mov_reg_imm( func, ecx, (unsigned long) addr ); + x86_call( func, ecx ); + } + + static void + emit_cmpps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src, + enum sse_cc cc ) + { + DUMP_RRI( "CMPPS", dst, src, cc ); + sse_cmpps( func, dst, src, cc ); + } + + static void + emit_cvttps2dq( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "CVTTPS2DQ", dst, src ); + sse2_cvttps2dq( func, dst, src ); + } + + static void + emit_maxps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MAXPS", dst, src ); + sse_maxps( func, dst, src ); + } + + static void + emit_minps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MINPS", dst, src ); + sse_minps( func, dst, src ); + } + + static void + emit_mov( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOV", dst, src ); + x86_mov( func, dst, src ); + } + + static void + emit_movaps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOVAPS", dst, src ); + sse_movaps( func, dst, src ); + } + + static void + emit_movss( + struct x86_function *func, + 
struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOVSS", dst, src ); + sse_movss( func, dst, src ); + } + + static void + emit_movups( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MOVUPS", dst, src ); + sse_movups( func, dst, src ); + } + + static void + emit_mulps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "MULPS", dst, src ); + sse_mulps( func, dst, src ); + } + + static void + emit_or( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "OR", dst, src ); + x86_or( func, dst, src ); + } + + static void + emit_orps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "ORPS", dst, src ); + sse_orps( func, dst, src ); + } + + static void + emit_pmovmskb( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "PMOVMSKB", dst, src ); + sse_pmovmskb( func, dst, src ); + } + + static void + emit_pop( + struct x86_function *func, + struct x86_reg dst ) + { + DUMP_R( "POP", dst ); + x86_pop( func, dst ); + } + + static void + emit_push( + struct x86_function *func, + struct x86_reg dst ) + { + DUMP_R( "PUSH", dst ); + x86_push( func, dst ); + } + + static void + emit_rcpps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "RCPPS", dst, src ); + sse2_rcpps( func, dst, src ); + } + + #ifdef WIN32 + static void + emit_retw( + struct x86_function *func, + unsigned size ) + { + DUMP_I( "RET", size ); + x86_retw( func, size ); + } + #else + static void + emit_ret( + struct x86_function *func ) + { + DUMP( "RET" ); + x86_ret( func ); + } + #endif + + static void + emit_rsqrtps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "RSQRTPS", dst, src ); + sse_rsqrtps( func, dst, src ); + } + + static void + emit_shufps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf ) + { + DUMP_RRI( "SHUFPS", dst, src, shuf ); + sse_shufps( func, dst, src, shuf ); + } + + static void + emit_subps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "SUBPS", dst, src ); + sse_subps( func, dst, src ); + } + + static void + emit_xorps( + struct x86_function *func, + struct x86_reg dst, + struct x86_reg src ) + { + DUMP_RR( "XORPS", dst, src ); + sse_xorps( func, dst, src ); + } + + /** + * Data fetch helpers. 
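+ * Constants and coefficients are scalars in memory, so they are loaded
+ * with MOVSS and broadcast across the XMM register with
+ * SHUFPS(0,0,0,0); inputs use unaligned MOVUPS while temporaries use
+ * aligned MOVAPS, matching the 16-byte-aligned temp storage.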
+ */
+
+ static void
+ emit_const(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_movss(
+ func,
+ make_xmm( xmm ),
+ get_const( vec, chan ) );
+ emit_shufps(
+ func,
+ make_xmm( xmm ),
+ make_xmm( xmm ),
+ SHUF( 0, 0, 0, 0 ) );
+ }
+
+ static void
+ emit_inputf(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_movups(
+ func,
+ make_xmm( xmm ),
+ get_input( vec, chan ) );
+ }
+
+ static void
+ emit_output(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_movups(
+ func,
+ get_output( vec, chan ),
+ make_xmm( xmm ) );
+ }
+
+ static void
+ emit_tempf(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_movaps(
+ func,
+ make_xmm( xmm ),
+ get_temp( vec, chan ) );
+ }
+
+ static void
+ emit_coef(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan,
+ unsigned member )
+ {
+ emit_movss(
+ func,
+ make_xmm( xmm ),
+ get_coef( vec, chan, member ) );
+ emit_shufps(
+ func,
+ make_xmm( xmm ),
+ make_xmm( xmm ),
+ SHUF( 0, 0, 0, 0 ) );
+ }
+
+ /**
+ * Data store helpers.
+ */
+
+ static void
+ emit_inputs(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_movups(
+ func,
+ get_input( vec, chan ),
+ make_xmm( xmm ) );
+ }
+
+ static void
+ emit_temps(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_movaps(
+ func,
+ get_temp( vec, chan ),
+ make_xmm( xmm ) );
+ }
+
+ static void
+ emit_addrs(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_temps(
+ func,
+ xmm,
+ vec + TGSI_EXEC_NUM_TEMPS,
+ chan );
+ }
+
+ /**
+ * Coefficient fetch helpers.
+ */
+
+ static void
+ emit_coef_a0(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_coef(
+ func,
+ xmm,
+ vec,
+ chan,
+ 0 );
+ }
+
+ static void
+ emit_coef_dadx(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_coef(
+ func,
+ xmm,
+ vec,
+ chan,
+ 1 );
+ }
+
+ static void
+ emit_coef_dady(
+ struct x86_function *func,
+ unsigned xmm,
+ unsigned vec,
+ unsigned chan )
+ {
+ emit_coef(
+ func,
+ xmm,
+ vec,
+ chan,
+ 2 );
+ }
+
+ /**
+ * Function call helpers.
+ */
+
+ static void
+ emit_push_gp(
+ struct x86_function *func )
+ {
+ emit_push(
+ func,
+ get_const_base() );
+ emit_push(
+ func,
+ get_input_base() );
+ emit_push(
+ func,
+ get_output_base() );
+
+ /* It is important on non-win32 platforms that the temp base is pushed last.
+ */
+ emit_push(
+ func,
+ get_temp_base() );
+ }
+
+ static void
+ emit_pop_gp(
+ struct x86_function *func )
+ {
+ /* Restore GP registers in reverse order.
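+ * (the pops must mirror emit_push_gp() exactly, last pushed popped
+ * first)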
+ */ + emit_pop( + func, + get_temp_base() ); + emit_pop( + func, + get_output_base() ); + emit_pop( + func, + get_input_base() ); + emit_pop( + func, + get_const_base() ); + } + + static void + emit_func_call_dst( + struct x86_function *func, + unsigned xmm_dst, + void (*code)() ) + { + emit_movaps( + func, + get_temp( TEMP_R0, 0 ), + make_xmm( xmm_dst ) ); + + emit_push_gp( + func ); + + #ifdef WIN32 + emit_push( + func, + get_temp( TEMP_R0, 0 ) ); + #endif + + emit_call( + func, + code ); + + emit_pop_gp( + func ); + + emit_movaps( + func, + make_xmm( xmm_dst ), + get_temp( TEMP_R0, 0 ) ); + } + + static void + emit_func_call_dst_src( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src, + void (*code)() ) + { + emit_movaps( + func, + get_temp( TEMP_R0, 1 ), + make_xmm( xmm_src ) ); + + emit_func_call_dst( + func, + xmm_dst, + code ); + } + + /** + * Low-level instruction translators. + */ + + static void + emit_abs( + struct x86_function *func, + unsigned xmm ) + { + emit_andps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_7FFFFFFF_I, + TGSI_EXEC_TEMP_7FFFFFFF_C ) ); + } + + static void + emit_add( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_addps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void XSTDCALL + cos4f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) cos( (double) store[0] ); + store[1] = (float) cos( (double) store[1] ); + store[2] = (float) cos( (double) store[2] ); + store[3] = (float) cos( (double) store[3] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = cosf( store[X + 0] ); + store[X + 1] = cosf( store[X + 1] ); + store[X + 2] = cosf( store[X + 2] ); + store[X + 3] = cosf( store[X + 3] ); + #endif + } + + static void + emit_cos( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + cos4f ); + } + + static void XSTDCALL + ex24f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) pow( 2.0, (double) store[0] ); + store[1] = (float) pow( 2.0, (double) store[1] ); + store[2] = (float) pow( 2.0, (double) store[2] ); + store[3] = (float) pow( 2.0, (double) store[3] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = powf( 2.0f, store[X + 0] ); + store[X + 1] = powf( 2.0f, store[X + 1] ); + store[X + 2] = powf( 2.0f, store[X + 2] ); + store[X + 3] = powf( 2.0f, store[X + 3] ); + #endif + } + + static void + emit_ex2( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + ex24f ); + } + + static void + emit_f2it( + struct x86_function *func, + unsigned xmm ) + { + emit_cvttps2dq( + func, + make_xmm( xmm ), + make_xmm( xmm ) ); + } + + static void XSTDCALL + flr4f( + float *store ) + { + #ifdef WIN32 + const unsigned X = 0; + #else + const unsigned X = TEMP_R0 * 16; + #endif + store[X + 0] = (float) floor( (double) store[X + 0] ); + store[X + 1] = (float) floor( (double) store[X + 1] ); + store[X + 2] = (float) floor( (double) store[X + 2] ); + store[X + 3] = (float) floor( (double) store[X + 3] ); + } + + static void + emit_flr( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + flr4f ); + } + + static void XSTDCALL + frc4f( + float *store ) + { + #ifdef WIN32 + const unsigned X = 0; + #else + const unsigned X = TEMP_R0 * 16; + #endif + store[X + 0] -= (float) floor( (double) store[X + 0] ); + store[X + 1] -= (float) floor( (double) store[X + 1] ); + store[X + 2] -= (float) floor( (double) store[X + 2] ); + store[X + 3] 
-= (float) floor( (double) store[X + 3] ); + } + + static void + emit_frc( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + frc4f ); + } + + static void XSTDCALL + lg24f( + float *store ) + { + #ifdef WIN32 + const unsigned X = 0; + #else + const unsigned X = TEMP_R0 * 16; + #endif + store[X + 0] = LOG2( store[X + 0] ); + store[X + 1] = LOG2( store[X + 1] ); + store[X + 2] = LOG2( store[X + 2] ); + store[X + 3] = LOG2( store[X + 3] ); + } + + static void + emit_lg2( + struct x86_function *func, + unsigned xmm_dst ) + { + emit_func_call_dst( + func, + xmm_dst, + lg24f ); + } + + static void + emit_MOV( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_movups( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_mul (struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src) + { + emit_mulps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_neg( + struct x86_function *func, + unsigned xmm ) + { + emit_xorps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_80000000_I, + TGSI_EXEC_TEMP_80000000_C ) ); + } + + static void XSTDCALL + pow4f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) pow( (double) store[0], (double) store[4] ); + store[1] = (float) pow( (double) store[1], (double) store[5] ); + store[2] = (float) pow( (double) store[2], (double) store[6] ); + store[3] = (float) pow( (double) store[3], (double) store[7] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = powf( store[X + 0], store[X + 4] ); + store[X + 1] = powf( store[X + 1], store[X + 5] ); + store[X + 2] = powf( store[X + 2], store[X + 6] ); + store[X + 3] = powf( store[X + 3], store[X + 7] ); + #endif + } + + static void + emit_pow( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_func_call_dst_src( + func, + xmm_dst, + xmm_src, + pow4f ); + } + + static void + emit_rcp ( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_rcpps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_rsqrt( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_rsqrtps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + static void + emit_setsign( + struct x86_function *func, + unsigned xmm ) + { + emit_orps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_80000000_I, + TGSI_EXEC_TEMP_80000000_C ) ); + } + + static void XSTDCALL + sin4f( + float *store ) + { + #ifdef WIN32 + store[0] = (float) sin( (double) store[0] ); + store[1] = (float) sin( (double) store[1] ); + store[2] = (float) sin( (double) store[2] ); + store[3] = (float) sin( (double) store[3] ); + #else + const unsigned X = TEMP_R0 * 16; + store[X + 0] = sinf( store[X + 0] ); + store[X + 1] = sinf( store[X + 1] ); + store[X + 2] = sinf( store[X + 2] ); + store[X + 3] = sinf( store[X + 3] ); + #endif + } + + static void + emit_sin (struct x86_function *func, + unsigned xmm_dst) + { + emit_func_call_dst( + func, + xmm_dst, + sin4f ); + } + + static void + emit_sub( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) + { + emit_subps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); + } + + /** + * Register fetch. 
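+ * emit_fetch() resolves the source's extended swizzle first (X/Y/Z/W
+ * select a component from the register file, ZERO and ONE read the
+ * prebuilt constant temporaries), then applies the sign mode:
+ * CLEAR -> abs, SET -> force the sign bit on, TOGGLE -> negate.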
+ */
+
+ static void
+ emit_fetch(
+ struct x86_function *func,
+ unsigned xmm,
+ const struct tgsi_full_src_register *reg,
+ const unsigned chan_index )
+ {
+ unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+
+ switch( swizzle ) {
+ case TGSI_EXTSWIZZLE_X:
+ case TGSI_EXTSWIZZLE_Y:
+ case TGSI_EXTSWIZZLE_Z:
+ case TGSI_EXTSWIZZLE_W:
+ switch( reg->SrcRegister.File ) {
+ case TGSI_FILE_CONSTANT:
+ emit_const(
+ func,
+ xmm,
+ reg->SrcRegister.Index,
+ swizzle );
+ break;
+
+ case TGSI_FILE_INPUT:
+ emit_inputf(
+ func,
+ xmm,
+ reg->SrcRegister.Index,
+ swizzle );
+ break;
+
+ case TGSI_FILE_TEMPORARY:
+ emit_tempf(
+ func,
+ xmm,
+ reg->SrcRegister.Index,
+ swizzle );
+ break;
+
+ default:
+ assert( 0 );
+ }
+ break;
+
+ case TGSI_EXTSWIZZLE_ZERO:
+ emit_tempf(
+ func,
+ xmm,
+ TGSI_EXEC_TEMP_00000000_I,
+ TGSI_EXEC_TEMP_00000000_C );
+ break;
+
+ case TGSI_EXTSWIZZLE_ONE:
+ emit_tempf(
+ func,
+ xmm,
+ TGSI_EXEC_TEMP_ONE_I,
+ TGSI_EXEC_TEMP_ONE_C );
+ break;
+
+ default:
+ assert( 0 );
+ }
+
+ switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
+ case TGSI_UTIL_SIGN_CLEAR:
+ emit_abs( func, xmm );
+ break;
+
+ case TGSI_UTIL_SIGN_SET:
+ emit_setsign( func, xmm );
+ break;
+
+ case TGSI_UTIL_SIGN_TOGGLE:
+ emit_neg( func, xmm );
+ break;
+
+ case TGSI_UTIL_SIGN_KEEP:
+ break;
+ }
+ }
+
+ #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
+ emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
+
+ /**
+ * Register store.
+ */
+
+ static void
+ emit_store(
+ struct x86_function *func,
+ unsigned xmm,
+ const struct tgsi_full_dst_register *reg,
+ const struct tgsi_full_instruction *inst,
+ unsigned chan_index )
+ {
+ switch( reg->DstRegister.File ) {
+ case TGSI_FILE_OUTPUT:
+ emit_output(
+ func,
+ xmm,
+ reg->DstRegister.Index,
+ chan_index );
+ break;
+
+ case TGSI_FILE_TEMPORARY:
+ emit_temps(
+ func,
+ xmm,
+ reg->DstRegister.Index,
+ chan_index );
+ break;
+
+ case TGSI_FILE_ADDRESS:
+ emit_addrs(
+ func,
+ xmm,
+ reg->DstRegister.Index,
+ chan_index );
+ break;
+
+ default:
+ assert( 0 );
+ }
+
+ switch( inst->Instruction.Saturate ) {
+ case TGSI_SAT_NONE:
+ break;
+
+ case TGSI_SAT_ZERO_ONE:
+ // assert( 0 );
+ break;
+
+ case TGSI_SAT_MINUS_PLUS_ONE:
+ assert( 0 );
+ break;
+ }
+ }
+
+ #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
+ emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+ /**
+ * High-level instruction translators.
+ */
+
+ static void
+ emit_kil(
+ struct x86_function *func,
+ const struct tgsi_full_src_register *reg )
+ {
+ unsigned uniquemask;
+ unsigned registers[4];
+ unsigned nextregister = 0;
+ unsigned firstchan = ~0;
+ unsigned chan_index;
+
+ /* This mask stores component bits that were already tested. Note that
+ * we test if the value is less than zero, so 1.0 and 0.0 need not be
+ * tested.
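+ * After CMPPS against zero each component that is less than zero reads
+ * as all-ones; PMOVMSKB collapses the byte sign bits into a GP
+ * register, the per-component masks are OR'd together, and the result
+ * is OR'd into the TGSI_EXEC_TEMP_KILMASK temporary -- the same one
+ * tgsi_exec_machine_run() inverts and returns.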
*/ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + FOR_EACH_CHANNEL( chan_index ) { + unsigned swizzle; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle( + reg, + chan_index ); + + /* check if the component has not been already tested */ + if( !(uniquemask & (1 << swizzle)) ) { + uniquemask |= 1 << swizzle; + + /* allocate register */ + registers[chan_index] = nextregister; + emit_fetch( + func, + nextregister, + reg, + chan_index ); + nextregister++; + + /* mark the first channel used */ + if( firstchan == ~0 ) { + firstchan = chan_index; + } + } + } + + emit_push( + func, + x86_make_reg( file_REG32, reg_AX ) ); + emit_push( + func, + x86_make_reg( file_REG32, reg_DX ) ); + + FOR_EACH_CHANNEL( chan_index ) { + if( uniquemask & (1 << chan_index) ) { + emit_cmpps( + func, + make_xmm( registers[chan_index] ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ), + cc_LessThan ); + + if( chan_index == firstchan ) { + emit_pmovmskb( + func, + x86_make_reg( file_REG32, reg_AX ), + make_xmm( registers[chan_index] ) ); + } + else { + emit_pmovmskb( + func, + x86_make_reg( file_REG32, reg_DX ), + make_xmm( registers[chan_index] ) ); + emit_or( + func, + x86_make_reg( file_REG32, reg_AX ), + x86_make_reg( file_REG32, reg_DX ) ); + } + } + } + + emit_or( + func, + get_temp( + TGSI_EXEC_TEMP_KILMASK_I, + TGSI_EXEC_TEMP_KILMASK_C ), + x86_make_reg( file_REG32, reg_AX ) ); + + emit_pop( + func, + x86_make_reg( file_REG32, reg_DX ) ); + emit_pop( + func, + x86_make_reg( file_REG32, reg_AX ) ); + } + + static void + emit_setcc( + struct x86_function *func, + struct tgsi_full_instruction *inst, + enum sse_cc cc ) + { + unsigned chan_index; + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_cmpps( + func, + make_xmm( 0 ), + make_xmm( 1 ), + cc ); + emit_andps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_ONE_I, + TGSI_EXEC_TEMP_ONE_C ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + } + + static void + emit_cmp( + struct x86_function *func, + struct tgsi_full_instruction *inst ) + { + unsigned chan_index; + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_cmpps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ), + cc_LessThan ); + emit_andps( + func, + make_xmm( 1 ), + make_xmm( 0 ) ); + emit_andnps( + func, + make_xmm( 0 ), + make_xmm( 2 ) ); + emit_orps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + } + + static int + emit_instruction( + struct x86_function *func, + struct tgsi_full_instruction *inst ) + { + unsigned chan_index; + + switch( inst->Instruction.Opcode ) { + case TGSI_OPCODE_ARL: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_f2it( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + /* TGSI_OPCODE_SWZ */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + emit_tempf( + func, + 0, + TGSI_EXEC_TEMP_ONE_I, + TGSI_EXEC_TEMP_ONE_C); + if( IS_DST0_CHANNEL_ENABLED( 
*inst, CHAN_X ) ) { + STORE( func, *inst, 0, 0, CHAN_X ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( func, *inst, 0, 0, CHAN_W ); + } + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_maxps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ) ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 1, 0, CHAN_Y ); + emit_maxps( + func, + make_xmm( 1 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ) ); + FETCH( func, *inst, 2, 0, CHAN_W ); + emit_minps( + func, + make_xmm( 2 ), + get_temp( + TGSI_EXEC_TEMP_128_I, + TGSI_EXEC_TEMP_128_C ) ); + emit_maxps( + func, + make_xmm( 2 ), + get_temp( + TGSI_EXEC_TEMP_MINUS_128_I, + TGSI_EXEC_TEMP_MINUS_128_C ) ); + emit_pow( func, 1, 2 ); + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_xorps( + func, + make_xmm( 2 ), + make_xmm( 2 ) ); + emit_cmpps( + func, + make_xmm( 2 ), + make_xmm( 0 ), + cc_LessThanEqual ); + emit_andps( + func, + make_xmm( 2 ), + make_xmm( 1 ) ); + STORE( func, *inst, 2, 0, CHAN_Z ); + } + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_rcp( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_rsqrt( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + return 0; + break; + + case TGSI_OPCODE_LOG: + return 0; + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_mul( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_add( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul(func, 1, 2 ); + emit_add(func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_W ); + FETCH( func, *inst, 2, 1, CHAN_W ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + 
emit_tempf( + func, + 0, + TGSI_EXEC_TEMP_ONE_I, + TGSI_EXEC_TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + FETCH( func, *inst, 0, 0, CHAN_Y ); + FETCH( func, *inst, 1, 1, CHAN_Y ); + emit_mul( func, 0, 1 ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + FETCH( func, *inst, 0, 0, CHAN_Z ); + STORE( func, *inst, 0, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + FETCH( func, *inst, 0, 1, CHAN_W ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_minps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_maxps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + emit_setcc( func, inst, cc_LessThan ); + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + emit_setcc( func, inst, cc_NotLessThan ); + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_mul( func, 0, 1 ); + emit_add( func, 0, 2 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_sub( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_sub( func, 1, 2 ); + emit_mul( func, 0, 1 ); + emit_add( func, 0, 2 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CND: + return 0; + break; + + case TGSI_OPCODE_CND0: + return 0; + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + return 0; + break; + + case TGSI_OPCODE_INDEX: + return 0; + break; + + case TGSI_OPCODE_NEGATE: + return 0; + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_frc( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + return 0; + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_flr( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + return 0; + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_ex2( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_lg2( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + 
break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_pow( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( func, *inst, 1, 1, CHAN_Z ); + FETCH( func, *inst, 3, 0, CHAN_Z ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 0, 0, CHAN_Y ); + FETCH( func, *inst, 4, 1, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + emit_MOV( func, 2, 0 ); + emit_mul( func, 2, 1 ); + emit_MOV( func, 5, 3 ); + emit_mul( func, 5, 4 ); + emit_sub( func, 2, 5 ); + STORE( func, *inst, 2, 0, CHAN_X ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 2, 1, CHAN_X ); + FETCH( func, *inst, 5, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + emit_mul( func, 3, 2 ); + emit_mul( func, 1, 5 ); + emit_sub( func, 3, 1 ); + STORE( func, *inst, 3, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + emit_mul( func, 5, 4 ); + emit_mul( func, 0, 2 ); + emit_sub( func, 5, 0 ); + STORE( func, *inst, 5, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + return 0; + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_abs( func, 0) ; + + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_RCC: + return 0; + break; + + case TGSI_OPCODE_DPH: + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 1, CHAN_W ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_cos( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + return 0; + break; + + case TGSI_OPCODE_DDY: + return 0; + break; + + case TGSI_OPCODE_KIL: + emit_kil( func, &inst->FullSrcRegisters[0] ); + break; + + case TGSI_OPCODE_PK2H: + return 0; + break; + + case TGSI_OPCODE_PK2US: + return 0; + break; + + case TGSI_OPCODE_PK4B: + return 0; + break; + + case TGSI_OPCODE_PK4UB: + return 0; + break; + + case TGSI_OPCODE_RFL: + return 0; + break; + + case TGSI_OPCODE_SEQ: + return 0; + break; + + case TGSI_OPCODE_SFL: + return 0; + break; + + case TGSI_OPCODE_SGT: + return 0; + break; + + case TGSI_OPCODE_SIN: + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_sin( func, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + return 0; + break; + + case TGSI_OPCODE_SNE: + return 0; + break; + + case TGSI_OPCODE_STR: + return 0; + break; + + 
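+ /* A note on the many "return 0" cases in this switch: returning 0
+ * marks the opcode as untranslatable, tgsi_emit_sse2() then reports
+ * failure, and callers such as the draw module fall back to running
+ * the shader through the tgsi_exec interpreter instead of the
+ * generated SSE code.
+ */
+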
+    case TGSI_OPCODE_TEX:
-       emit_tempf(
-          func,
-          0,
-          TGSI_EXEC_TEMP_ONE_I,
-          TGSI_EXEC_TEMP_ONE_C );
-       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-          STORE( func, *inst, 0, 0, chan_index );
++      if (0) {
++         /* Disable dummy texture code:
++          */
++         emit_tempf(
++            func,
++            0,
++            TGSI_EXEC_TEMP_ONE_I,
++            TGSI_EXEC_TEMP_ONE_C );
++         FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
++            STORE( func, *inst, 0, 0, chan_index );
++         }
++      }
++      else {
++         return 0;
+       }
+       break;
+ 
+    case TGSI_OPCODE_TXD:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_UP2H:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_UP2US:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_UP4B:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_UP4UB:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_X2D:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_ARA:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_ARR:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_BRA:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_CAL:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_RET:
+    case TGSI_OPCODE_END:
+ #ifdef WIN32
+       emit_retw( func, 16 );
+ #else
+       emit_ret( func );
+ #endif
+       break;
+ 
+    case TGSI_OPCODE_SSG:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_CMP:
+       emit_cmp( func, inst );
+       break;
+ 
+    case TGSI_OPCODE_SCS:
+       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+          FETCH( func, *inst, 0, 0, CHAN_X );
+          emit_cos( func, 0 );
+          STORE( func, *inst, 0, 0, CHAN_X );
+       }
+       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+          FETCH( func, *inst, 0, 0, CHAN_Y );
+          emit_sin( func, 0 );
+          STORE( func, *inst, 0, 0, CHAN_Y );
+       }
+       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+          FETCH( func, *inst, 0, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C );
+          STORE( func, *inst, 0, 0, CHAN_Z );
+       }
+       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+          FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
+          STORE( func, *inst, 0, 0, CHAN_W );
+       }
+       break;
+ 
+    case TGSI_OPCODE_TXB:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_NRM:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_DIV:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_DP2:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_TXL:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_BRK:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_IF:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_LOOP:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_REP:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_ELSE:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_ENDIF:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_ENDLOOP:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_ENDREP:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_PUSHA:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_POPA:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_CEIL:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_I2F:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_NOT:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_TRUNC:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_SHL:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_SHR:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_AND:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_OR:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_MOD:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_XOR:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_SAD:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_TXF:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_TXQ:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_CONT:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_EMIT:
+       return 0;
+       break;
+ 
+    case TGSI_OPCODE_ENDPRIM:
+       return 0;
+       break;
+ 
+    default:
+       return 0;
+    }
+ 
+    return 1;
+ }
+ 
+ static void
+ emit_declaration(
+    struct x86_function *func,
+    struct tgsi_full_declaration *decl )
+ {
+    if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+       unsigned first, last, mask;
+       unsigned i, j;
+ 
+       assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+ 
+       first = decl->u.DeclarationRange.First;
+       last = decl->u.DeclarationRange.Last;
+       mask = decl->Declaration.UsageMask;
+ 
-       /* Do not touch WPOS.xy */
-       if( first == 0 ) {
-          mask &= ~TGSI_WRITEMASK_XY;
-          if( mask == TGSI_WRITEMASK_NONE ) {
-             first++;
-          }
-       }
- 
+       for( i = first; i <= last; i++ ) {
+          for( j = 0; j < NUM_CHANNELS; j++ ) {
+             if( mask & (1 << j) ) {
+                switch( decl->Interpolation.Interpolate ) {
+                case TGSI_INTERPOLATE_CONSTANT:
+                   emit_coef_a0( func, 0, i, j );
+                   emit_inputs( func, 0, i, j );
+                   break;
+ 
+                case TGSI_INTERPOLATE_LINEAR:
-                   emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
++                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                   emit_coef_dadx( func, 1, i, j );
-                   emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
++                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                   emit_coef_dady( func, 3, i, j );
+                   emit_mul( func, 0, 1 );    /* x * dadx */
+                   emit_coef_a0( func, 4, i, j );
+                   emit_mul( func, 2, 3 );    /* y * dady */
+                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                   emit_inputs( func, 0, i, j );
+                   break;
+ 
+                case TGSI_INTERPOLATE_PERSPECTIVE:
-                   emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
++                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                   emit_coef_dadx( func, 1, i, j );
-                   emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
++                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                   emit_coef_dady( func, 3, i, j );
+                   emit_mul( func, 0, 1 );    /* x * dadx */
-                   emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
++                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                   emit_coef_a0( func, 5, i, j );
+                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                   emit_mul( func, 2, 3 );    /* y * dady */
+                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                   emit_inputs( func, 0, i, j );
+                   break;
+ 
+                default:
+                   assert( 0 );
+                   break;
+                }
+             }
+          }
+       }
+    }
+ }
+ 
+ unsigned
+ tgsi_emit_sse2(
+    struct tgsi_token *tokens,
+    struct x86_function *func )
+ {
+    struct tgsi_parse_context parse;
+    unsigned ok = 1;
+ 
+    DUMP_START();
+ 
+    func->csr = func->store;
+ 
+    emit_mov(
+       func,
+       get_input_base(),
+       get_argument( 0 ) );
+    emit_mov(
+       func,
+       get_output_base(),
+       get_argument( 1 ) );
+    emit_mov(
+       func,
+       get_const_base(),
+       get_argument( 2 ) );
+    emit_mov(
+       func,
+       get_temp_base(),
+       get_argument( 3 ) );
+ 
+    tgsi_parse_init( &parse, tokens );
+ 
+    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
+       tgsi_parse_token( &parse );
+ 
+       switch( parse.FullToken.Token.Type ) {
+       case TGSI_TOKEN_TYPE_DECLARATION:
+          break;
+ 
+       case TGSI_TOKEN_TYPE_INSTRUCTION:
+          ok = emit_instruction(
+             func,
+             &parse.FullToken.FullInstruction );
+ 
+          if (!ok) {
+             debug_printf("failed to translate tgsi opcode %d\n",
+                          parse.FullToken.FullInstruction.Instruction.Opcode );
+          }
+          break;
+ 
+       case TGSI_TOKEN_TYPE_IMMEDIATE:
+          /* XXX implement this */
+          ok = 0;
+          debug_printf("failed to emit immediate value\n");
+          break;
+ 
+       default:
+          assert( 0 );
+          ok = 0;
+          break;
+       }
+    }
+ 
+    tgsi_parse_free( &parse );
+ 
+    DUMP_END();
+ 
+    return ok;
+ }
+ 
+ /**
+  * Fragment shaders are responsible for interpolating shader inputs. Because on
+  * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
+  * output, const, temp and coef), the code is split into two phases:
+  * the DECLARATION phase and the INSTRUCTION phase.
+  * The GP register holding the output argument is aliased with the coef argument,
+  * as outputs are not needed in the DECLARATION phase.
+  */
+ unsigned
+ tgsi_emit_sse2_fs(
+    struct tgsi_token *tokens,
+    struct x86_function *func )
+ {
+    struct tgsi_parse_context parse;
+    boolean instruction_phase = FALSE;
+ 
+    DUMP_START();
+ 
+    func->csr = func->store;
+ 
+    /* DECLARATION phase, do not load output argument. */
+    emit_mov(
+       func,
+       get_input_base(),
+       get_argument( 0 ) );
+    emit_mov(
+       func,
+       get_const_base(),
+       get_argument( 2 ) );
+    emit_mov(
+       func,
+       get_temp_base(),
+       get_argument( 3 ) );
+    emit_mov(
+       func,
+       get_coef_base(),
+       get_argument( 4 ) );
+ 
+    tgsi_parse_init( &parse, tokens );
+ 
+    while( !tgsi_parse_end_of_tokens( &parse ) ) {
+       tgsi_parse_token( &parse );
+ 
+       switch( parse.FullToken.Token.Type ) {
+       case TGSI_TOKEN_TYPE_DECLARATION:
+          emit_declaration(
+             func,
+             &parse.FullToken.FullDeclaration );
+          break;
+ 
+       case TGSI_TOKEN_TYPE_INSTRUCTION:
+          if( !instruction_phase ) {
+             /* INSTRUCTION phase, overwrite coef with output. */
+             instruction_phase = TRUE;
+             emit_mov(
+                func,
+                get_output_base(),
+                get_argument( 1 ) );
+          }
+          emit_instruction(
+             func,
+             &parse.FullToken.FullInstruction );
+          break;
+ 
+       case TGSI_TOKEN_TYPE_IMMEDIATE:
+          /* XXX implement this */
+          assert(0);
+          break;
+ 
+       default:
+          assert( 0 );
+       }
+    }
+ 
+    tgsi_parse_free( &parse );
+ 
+    DUMP_END();
+ 
+    return 1;
+ }
+ 
+ #endif /* i386 */
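
For readers tracing the failure path: when emit_instruction() returns 0, tgsi_emit_sse2() logs the untranslatable opcode and returns 0, and the vertex-shader path in draw_vertex_shader.c (earlier in this commit) falls back to the TGSI interpreter. The sketch below mirrors that consumer side. The codegen_function typedef shown here is an assumption about its shape in draw_private.h; its four parameters correspond to the get_argument(0)..get_argument(3) slots loaded in the prologue above (inputs, outputs, constants, temporaries).

    #include "draw_private.h"
    #include "x86/rtasm/x86sse.h"

    /* Assumed shape of the typedef; the real one lives in draw_private.h. */
    typedef void (*codegen_function)( struct tgsi_exec_vector *inputs,
                                      struct tgsi_exec_vector *outputs,
                                      float (*consts)[4],
                                      struct tgsi_exec_vector *temps );

    static void
    run_emitted_vs( struct draw_context *draw,
                    struct tgsi_exec_machine *machine )
    {
       /* cast away const, exactly as draw_vertex_shader.c does */
       struct draw_vertex_shader *shader =
          (struct draw_vertex_shader *) draw->vertex_shader;
       codegen_function func =
          (codegen_function) x86_get_func( &shader->sse2_program );

       if (func)
          func( machine->Inputs,
                machine->Outputs,
                machine->Consts,
                machine->Temps );
       else
          tgsi_exec_machine_run( machine );   /* interpreter fallback */
    }

Checking x86_get_func() for NULL keeps the interpreter available as a safe fallback whenever translation was refused.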
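The comment above tgsi_emit_sse2_fs() compresses a real constraint; written out as a signature, the five-argument fragment-shader convention would look roughly like the following. The fs_codegen_function name and the tgsi_interp_coef parameter type are hypothetical, chosen only to illustrate why the output and coef arguments can share a GP register.

    /* Hypothetical signature for the fragment-shader variant: five arguments,
     * where get_argument(1) (outputs) is only loaded once the INSTRUCTION
     * phase starts, so its GP register can double as get_argument(4) (coef)
     * during the DECLARATION phase.
     */
    typedef void (*fs_codegen_function)(
       const struct tgsi_exec_vector *inputs,   /* get_argument(0) */
       struct tgsi_exec_vector *outputs,        /* get_argument(1), INSTRUCTION phase */
       const float (*consts)[4],                /* get_argument(2) */
       struct tgsi_exec_vector *temps,          /* get_argument(3) */
       const struct tgsi_interp_coef *coefs );  /* get_argument(4), DECLARATION phase */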