src/gallium/auxiliary/tgsi/tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #if defined(PIPE_ARCH_SSE)
36 #include "util/u_sse.h"
37 #endif
38 #include "tgsi/tgsi_parse.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi_exec.h"
41 #include "tgsi_sse2.h"
42
43 #include "rtasm/rtasm_x86sse.h"
44
  45 /* For the HIGH_PRECISION 1/sqrt() path below.
  46 *
  47 * Enabling it costs about 100 fps (close to 10%) in gears.
  48 */
49 #define HIGH_PRECISION 1
50
51 #define FAST_MATH 1
52
53
54 #define FOR_EACH_CHANNEL( CHAN )\
55 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
56
57 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
58 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
59
60 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
61 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
62
63 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
64 FOR_EACH_CHANNEL( CHAN )\
65 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
66
67 #define CHAN_X 0
68 #define CHAN_Y 1
69 #define CHAN_Z 2
70 #define CHAN_W 3
71
72 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
73 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
74
75 #define TEMP_R0 TGSI_EXEC_TEMP_R0
76 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
77 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
78 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
79
80
81 /**
82 * X86 utility functions.
83 */
84
85 static struct x86_reg
86 make_xmm(
87 unsigned xmm )
88 {
89 return x86_make_reg(
90 file_XMM,
91 (enum x86_reg_name) xmm );
92 }
93
94 /**
95 * X86 register mapping helpers.
96 */
97
98 static struct x86_reg
99 get_const_base( void )
100 {
101 return x86_make_reg(
102 file_REG32,
103 reg_CX );
104 }
105
106 static struct x86_reg
107 get_input_base( void )
108 {
109 return x86_make_reg(
110 file_REG32,
111 reg_AX );
112 }
113
114 static struct x86_reg
115 get_output_base( void )
116 {
117 return x86_make_reg(
118 file_REG32,
119 reg_DX );
120 }
121
122 static struct x86_reg
123 get_temp_base( void )
124 {
125 return x86_make_reg(
126 file_REG32,
127 reg_BX );
128 }
129
130 static struct x86_reg
131 get_coef_base( void )
132 {
133 return get_output_base();
134 }
135
136 static struct x86_reg
137 get_immediate_base( void )
138 {
139 return x86_make_reg(
140 file_REG32,
141 reg_DI );
142 }
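/* Summary of the register mapping above, as set up by the code that calls
 * the generated shader (the prologue is not shown in this section):
 * eax -> inputs, ecx -> constants, edx -> outputs (reused for the
 * interpolation coefficients), ebx -> temporaries, edi -> immediates.
 */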
143
144
145 /**
146 * Data access helpers.
147 */
148
149
150 static struct x86_reg
151 get_immediate(
152 unsigned vec,
153 unsigned chan )
154 {
155 return x86_make_disp(
156 get_immediate_base(),
157 (vec * 4 + chan) * 4 );
158 }
159
160 static struct x86_reg
161 get_const(
162 unsigned vec,
163 unsigned chan )
164 {
165 return x86_make_disp(
166 get_const_base(),
167 (vec * 4 + chan) * 4 );
168 }
169
170 static struct x86_reg
171 get_input(
172 unsigned vec,
173 unsigned chan )
174 {
175 return x86_make_disp(
176 get_input_base(),
177 (vec * 4 + chan) * 16 );
178 }
179
180 static struct x86_reg
181 get_output(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_output_base(),
187 (vec * 4 + chan) * 16 );
188 }
189
190 static struct x86_reg
191 get_temp(
192 unsigned vec,
193 unsigned chan )
194 {
195 return x86_make_disp(
196 get_temp_base(),
197 (vec * 4 + chan) * 16 );
198 }
199
200 static struct x86_reg
201 get_coef(
202 unsigned vec,
203 unsigned chan,
204 unsigned member )
205 {
206 return x86_make_disp(
207 get_coef_base(),
208 ((vec * 3 + member) * 4 + chan) * 4 );
209 }
210
211
212 static void
213 emit_ret(
214 struct x86_function *func )
215 {
216 x86_ret( func );
217 }
218
219
220 /**
221 * Data fetch helpers.
222 */
223
224 /**
225 * Copy a shader constant to xmm register
226 * \param xmm the destination xmm register
227 * \param vec the src const buffer index
228 * \param chan src channel to fetch (X, Y, Z or W)
229 */
230 static void
231 emit_const(
232 struct x86_function *func,
233 uint xmm,
234 int vec,
235 uint chan,
236 uint indirect,
237 uint indirectFile,
238 int indirectIndex )
239 {
240 if (indirect) {
241 /* 'vec' is the offset from the address register's value.
242 * We're loading CONST[ADDR+vec] into an xmm register.
243 */
244 struct x86_reg r0 = get_input_base();
245 struct x86_reg r1 = get_output_base();
246 uint i;
247
248 assert( indirectFile == TGSI_FILE_ADDRESS );
249 assert( indirectIndex == 0 );
250
251 x86_push( func, r0 );
252 x86_push( func, r1 );
253
254 /*
255 * Loop over the four pixels or vertices in the quad.
256 * Get the value of the address (offset) register for pixel/vertex[i],
257 * add it to the src offset and index into the constant buffer.
258 * Note that we're working on SOA data.
259 * If any of the pixel/vertex execution channels are unused their
260 * values will be garbage. It's very important that we don't use
261 * those garbage values as indexes into the constant buffer since
262 * that'll cause segfaults.
263 * The solution is to bitwise-AND the offset with the execution mask
264 * register whose values are either 0 or ~0.
 265 * The caller must set up the execution mask register to indicate
266 * which channels are valid/alive before running the shader.
267 * The execution mask will also figure into loops and conditionals
268 * someday.
269 */
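      /* In C terms, the loop below computes roughly the following (the
       * names are illustrative only):
       *
       *    for (i = 0; i < QUAD_SIZE; i++) {
       *       int a = addr[i] & exec_mask[i];          // masked offset
       *       tmp_r0[i] = const_buf[(vec + a) * 4 + chan];
       *    }
       *
       * since each constant vector is 4 floats (16 bytes) and the masked
       * address register value is scaled by 16 via the repeated adds.
       */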
270 for (i = 0; i < QUAD_SIZE; i++) {
271 /* r1 = address register[i] */
272 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
273 /* r0 = execution mask[i] */
274 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
275 /* r1 = r1 & r0 */
276 x86_and( func, r1, r0 );
277 /* r0 = 'vec', the offset */
278 x86_lea( func, r0, get_const( vec, chan ) );
279
280 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
281 */
282 x86_add( func, r1, r1 );
283 x86_add( func, r1, r1 );
284 x86_add( func, r1, r1 );
285 x86_add( func, r1, r1 );
286
287 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
288 x86_mov( func, r1, x86_deref( r0 ) );
289 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
290 }
291
292 x86_pop( func, r1 );
293 x86_pop( func, r0 );
294
295 sse_movaps(
296 func,
297 make_xmm( xmm ),
298 get_temp( TEMP_R0, CHAN_X ) );
299 }
300 else {
301 /* 'vec' is the index into the src register file, such as TEMP[vec] */
302 assert( vec >= 0 );
303
304 sse_movss(
305 func,
306 make_xmm( xmm ),
307 get_const( vec, chan ) );
308 sse_shufps(
309 func,
310 make_xmm( xmm ),
311 make_xmm( xmm ),
312 SHUF( 0, 0, 0, 0 ) );
313 }
314 }
315
316 static void
317 emit_immediate(
318 struct x86_function *func,
319 unsigned xmm,
320 unsigned vec,
321 unsigned chan )
322 {
323 sse_movss(
324 func,
325 make_xmm( xmm ),
326 get_immediate( vec, chan ) );
327 sse_shufps(
328 func,
329 make_xmm( xmm ),
330 make_xmm( xmm ),
331 SHUF( 0, 0, 0, 0 ) );
332 }
333
334
335 /**
336 * Copy a shader input to xmm register
337 * \param xmm the destination xmm register
338 * \param vec the src input attrib
339 * \param chan src channel to fetch (X, Y, Z or W)
340 */
341 static void
342 emit_inputf(
343 struct x86_function *func,
344 unsigned xmm,
345 unsigned vec,
346 unsigned chan )
347 {
348 sse_movups(
349 func,
350 make_xmm( xmm ),
351 get_input( vec, chan ) );
352 }
353
354 /**
355 * Store an xmm register to a shader output
356 * \param xmm the source xmm register
357 * \param vec the dest output attrib
 358 * \param chan the dest channel to store (X, Y, Z or W)
359 */
360 static void
361 emit_output(
362 struct x86_function *func,
363 unsigned xmm,
364 unsigned vec,
365 unsigned chan )
366 {
367 sse_movups(
368 func,
369 get_output( vec, chan ),
370 make_xmm( xmm ) );
371 }
372
373 /**
374 * Copy a shader temporary to xmm register
375 * \param xmm the destination xmm register
376 * \param vec the src temp register
377 * \param chan src channel to fetch (X, Y, Z or W)
378 */
379 static void
380 emit_tempf(
381 struct x86_function *func,
382 unsigned xmm,
383 unsigned vec,
384 unsigned chan )
385 {
386 sse_movaps(
387 func,
388 make_xmm( xmm ),
389 get_temp( vec, chan ) );
390 }
391
392 /**
393 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
394 * \param xmm the destination xmm register
395 * \param vec the src input/attribute coefficient index
396 * \param chan src channel to fetch (X, Y, Z or W)
397 * \param member 0=a0, 1=dadx, 2=dady
398 */
399 static void
400 emit_coef(
401 struct x86_function *func,
402 unsigned xmm,
403 unsigned vec,
404 unsigned chan,
405 unsigned member )
406 {
407 sse_movss(
408 func,
409 make_xmm( xmm ),
410 get_coef( vec, chan, member ) );
411 sse_shufps(
412 func,
413 make_xmm( xmm ),
414 make_xmm( xmm ),
415 SHUF( 0, 0, 0, 0 ) );
416 }
417
418 /**
419 * Data store helpers.
420 */
421
422 static void
423 emit_inputs(
424 struct x86_function *func,
425 unsigned xmm,
426 unsigned vec,
427 unsigned chan )
428 {
429 sse_movups(
430 func,
431 get_input( vec, chan ),
432 make_xmm( xmm ) );
433 }
434
435 static void
436 emit_temps(
437 struct x86_function *func,
438 unsigned xmm,
439 unsigned vec,
440 unsigned chan )
441 {
442 sse_movaps(
443 func,
444 get_temp( vec, chan ),
445 make_xmm( xmm ) );
446 }
447
448 static void
449 emit_addrs(
450 struct x86_function *func,
451 unsigned xmm,
452 unsigned vec,
453 unsigned chan )
454 {
455 assert( vec == 0 );
456
457 emit_temps(
458 func,
459 xmm,
460 vec + TGSI_EXEC_TEMP_ADDR,
461 chan );
462 }
463
464 /**
 465 * Coefficient fetch helpers.
466 */
467
468 static void
469 emit_coef_a0(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 emit_coef(
476 func,
477 xmm,
478 vec,
479 chan,
480 0 );
481 }
482
483 static void
484 emit_coef_dadx(
485 struct x86_function *func,
486 unsigned xmm,
487 unsigned vec,
488 unsigned chan )
489 {
490 emit_coef(
491 func,
492 xmm,
493 vec,
494 chan,
495 1 );
496 }
497
498 static void
499 emit_coef_dady(
500 struct x86_function *func,
501 unsigned xmm,
502 unsigned vec,
503 unsigned chan )
504 {
505 emit_coef(
506 func,
507 xmm,
508 vec,
509 chan,
510 2 );
511 }
512
513 /**
514 * Function call helpers.
515 */
516
517 /**
 518 * NOTE: In gcc, if the called function uses SSE intrinsics, then it must
 519 * be defined with __attribute__((force_align_arg_pointer)), as we do not
 520 * guarantee that the stack pointer is 16-byte aligned, as those intrinsics expect.
521 */
522 static void
523 emit_func_call_dst(
524 struct x86_function *func,
525 unsigned xmm_save,
526 unsigned xmm_dst,
527 void (PIPE_CDECL *code)() )
528 {
529 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
530 unsigned i, n;
531 unsigned xmm_mask;
532
533 /* Bitmask of the xmm registers to save */
534 xmm_mask = (1 << xmm_save) - 1;
535 xmm_mask &= ~(1 << xmm_dst);
536
537 sse_movaps(
538 func,
539 get_temp( TEMP_R0, 0 ),
540 make_xmm( xmm_dst ) );
541
542 x86_push(
543 func,
544 x86_make_reg( file_REG32, reg_AX) );
545 x86_push(
546 func,
547 x86_make_reg( file_REG32, reg_CX) );
548 x86_push(
549 func,
550 x86_make_reg( file_REG32, reg_DX) );
551
552 for(i = 0, n = 0; i < 8; ++i)
553 if(xmm_mask & (1 << i))
554 ++n;
555
556 x86_sub_imm(
557 func,
558 x86_make_reg( file_REG32, reg_SP ),
559 n*16);
560
561 for(i = 0, n = 0; i < 8; ++i)
562 if(xmm_mask & (1 << i)) {
563 sse_movups(
564 func,
565 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
566 make_xmm( i ) );
567 ++n;
568 }
569
570 x86_lea(
571 func,
572 ecx,
573 get_temp( TEMP_R0, 0 ) );
574
575 x86_push( func, ecx );
576 x86_mov_reg_imm( func, ecx, (unsigned long) code );
577 x86_call( func, ecx );
578 x86_pop(func, ecx );
579
580 for(i = 0, n = 0; i < 8; ++i)
581 if(xmm_mask & (1 << i)) {
582 sse_movups(
583 func,
584 make_xmm( i ),
585 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
586 ++n;
587 }
588
589 x86_add_imm(
590 func,
591 x86_make_reg( file_REG32, reg_SP ),
592 n*16);
593
594 /* Restore GP registers in a reverse order.
595 */
596 x86_pop(
597 func,
598 x86_make_reg( file_REG32, reg_DX) );
599 x86_pop(
600 func,
601 x86_make_reg( file_REG32, reg_CX) );
602 x86_pop(
603 func,
604 x86_make_reg( file_REG32, reg_AX) );
605
606 sse_movaps(
607 func,
608 make_xmm( xmm_dst ),
609 get_temp( TEMP_R0, 0 ) );
610 }
611
612 static void
613 emit_func_call_dst_src(
614 struct x86_function *func,
615 unsigned xmm_save,
616 unsigned xmm_dst,
617 unsigned xmm_src,
618 void (PIPE_CDECL *code)() )
619 {
620 sse_movaps(
621 func,
622 get_temp( TEMP_R0, 1 ),
623 make_xmm( xmm_src ) );
624
625 emit_func_call_dst(
626 func,
627 xmm_save,
628 xmm_dst,
629 code );
630 }
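/* Callout ABI implied by the two helpers above: 'code' is a cdecl function
 * taking a single float* argument that points at TEMP_R0 in the temp file.
 * The destination quad is passed and returned in store[0..3];
 * emit_func_call_dst_src() additionally passes the source quad in
 * store[4..7] (cf. pow4f() below).
 */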
631
632
633 #if defined(PIPE_ARCH_SSE)
634
635 /*
636 * Fast SSE2 implementation of special math functions.
637 */
638
639 #define POLY0(x, c0) _mm_set1_ps(c0)
640 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
641 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
642 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
643 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
644 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
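/* POLYn(x, c0..cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n by Horner's rule, entirely in SSE registers.
 */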
645
646 #define EXP_POLY_DEGREE 3
647 #define LOG_POLY_DEGREE 5
648
649 /**
650 * See http://www.devmaster.net/forums/showthread.php?p=43580
651 */
652 static INLINE __m128
653 exp2f4(__m128 x)
654 {
655 __m128i ipart;
656 __m128 fpart, expipart, expfpart;
657
658 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
659 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
660
661 /* ipart = int(x - 0.5) */
662 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
663
664 /* fpart = x - ipart */
665 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
666
667 /* expipart = (float) (1 << ipart) */
668 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
669
670 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
671 #if EXP_POLY_DEGREE == 5
672 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
673 #elif EXP_POLY_DEGREE == 4
674 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
675 #elif EXP_POLY_DEGREE == 3
676 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
677 #elif EXP_POLY_DEGREE == 2
678 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
679 #else
680 #error
681 #endif
682
683 return _mm_mul_ps(expipart, expfpart);
684 }
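/* Worked example for the exponent trick above: for ipart == 3,
 * (3 + 127) << 23 == 0x41000000, which is the IEEE-754 single-precision
 * bit pattern of 8.0f == 2^3.  The polynomial then supplies the 2^fpart
 * factor, so exp2f4(x) == 2^ipart * 2^fpart.
 */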
685
686
687 /**
688 * See http://www.devmaster.net/forums/showthread.php?p=43580
689 */
690 static INLINE __m128
691 log2f4(__m128 x)
692 {
693 __m128i expmask = _mm_set1_epi32(0x7f800000);
694 __m128i mantmask = _mm_set1_epi32(0x007fffff);
695 __m128 one = _mm_set1_ps(1.0f);
696
697 __m128i i = _mm_castps_si128(x);
698
699 /* exp = (float) exponent(x) */
700 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
701
702 /* mant = (float) mantissa(x) */
703 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
704
705 __m128 logmant;
706
707 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 708 * These coefficients can be generated with
709 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
710 */
711 #if LOG_POLY_DEGREE == 6
712 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
713 #elif LOG_POLY_DEGREE == 5
714 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
715 #elif LOG_POLY_DEGREE == 4
716 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
717 #elif LOG_POLY_DEGREE == 3
718 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
719 #else
720 #error
721 #endif
722
 723 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
724 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
725
726 return _mm_add_ps(logmant, exp);
727 }
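/* Worked example: for x == 8.0f the exponent field holds 130, so
 * exp == 130 - 127 == 3 and mant == 1.0.  logmant is multiplied by
 * (mant - one) and so evaluates to exactly 0 at mant == 1, giving
 * log2f4(8.0) == 3.0.
 */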
728
729
730 static INLINE __m128
731 powf4(__m128 x, __m128 y)
732 {
733 return exp2f4(_mm_mul_ps(log2f4(x), y));
734 }
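/* Note: powf4() uses the identity pow(x, y) == 2^(y * log2(x)) and is
 * therefore only meaningful for x > 0: log2f4() simply reads the bit
 * pattern, so the sign bit is ignored and x == 0 yields -127 rather
 * than -inf.
 */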
735
736 #endif /* PIPE_ARCH_SSE */
737
738
739
740 /**
741 * Low-level instruction translators.
742 */
743
744 static void
745 emit_abs(
746 struct x86_function *func,
747 unsigned xmm )
748 {
749 sse_andps(
750 func,
751 make_xmm( xmm ),
752 get_temp(
753 TGSI_EXEC_TEMP_7FFFFFFF_I,
754 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
755 }
756
757 static void
758 emit_add(
759 struct x86_function *func,
760 unsigned xmm_dst,
761 unsigned xmm_src )
762 {
763 sse_addps(
764 func,
765 make_xmm( xmm_dst ),
766 make_xmm( xmm_src ) );
767 }
768
769 static void PIPE_CDECL
770 cos4f(
771 float *store )
772 {
773 store[0] = cosf( store[0] );
774 store[1] = cosf( store[1] );
775 store[2] = cosf( store[2] );
776 store[3] = cosf( store[3] );
777 }
778
779 static void
780 emit_cos(
781 struct x86_function *func,
782 unsigned xmm_save,
783 unsigned xmm_dst )
784 {
785 emit_func_call_dst(
786 func,
787 xmm_save,
788 xmm_dst,
789 cos4f );
790 }
791
792 static void PIPE_CDECL
793 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
794 __attribute__((force_align_arg_pointer))
795 #endif
796 ex24f(
797 float *store )
798 {
799 #if defined(PIPE_ARCH_SSE)
800 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
801 #else
802 store[0] = util_fast_exp2( store[0] );
803 store[1] = util_fast_exp2( store[1] );
804 store[2] = util_fast_exp2( store[2] );
805 store[3] = util_fast_exp2( store[3] );
806 #endif
807 }
808
809 static void
810 emit_ex2(
811 struct x86_function *func,
812 unsigned xmm_save,
813 unsigned xmm_dst )
814 {
815 emit_func_call_dst(
816 func,
817 xmm_save,
818 xmm_dst,
819 ex24f );
820 }
821
822 static void
823 emit_f2it(
824 struct x86_function *func,
825 unsigned xmm )
826 {
827 sse2_cvttps2dq(
828 func,
829 make_xmm( xmm ),
830 make_xmm( xmm ) );
831 }
832
833 static void
834 emit_i2f(
835 struct x86_function *func,
836 unsigned xmm )
837 {
838 sse2_cvtdq2ps(
839 func,
840 make_xmm( xmm ),
841 make_xmm( xmm ) );
842 }
843
844 static void PIPE_CDECL
845 flr4f(
846 float *store )
847 {
848 store[0] = floorf( store[0] );
849 store[1] = floorf( store[1] );
850 store[2] = floorf( store[2] );
851 store[3] = floorf( store[3] );
852 }
853
854 static void
855 emit_flr(
856 struct x86_function *func,
857 unsigned xmm_save,
858 unsigned xmm_dst )
859 {
860 emit_func_call_dst(
861 func,
862 xmm_save,
863 xmm_dst,
864 flr4f );
865 }
866
867 static void PIPE_CDECL
868 frc4f(
869 float *store )
870 {
871 store[0] -= floorf( store[0] );
872 store[1] -= floorf( store[1] );
873 store[2] -= floorf( store[2] );
874 store[3] -= floorf( store[3] );
875 }
876
877 static void
878 emit_frc(
879 struct x86_function *func,
880 unsigned xmm_save,
881 unsigned xmm_dst )
882 {
883 emit_func_call_dst(
884 func,
885 xmm_save,
886 xmm_dst,
887 frc4f );
888 }
889
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
892 __attribute__((force_align_arg_pointer))
893 #endif
894 lg24f(
895 float *store )
896 {
897 #if defined(PIPE_ARCH_SSE)
898 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
899 #else
900 store[0] = util_fast_log2( store[0] );
901 store[1] = util_fast_log2( store[1] );
902 store[2] = util_fast_log2( store[2] );
903 store[3] = util_fast_log2( store[3] );
904 #endif
905 }
906
907 static void
908 emit_lg2(
909 struct x86_function *func,
910 unsigned xmm_save,
911 unsigned xmm_dst )
912 {
913 emit_func_call_dst(
914 func,
915 xmm_save,
916 xmm_dst,
917 lg24f );
918 }
919
920 static void
921 emit_MOV(
922 struct x86_function *func,
923 unsigned xmm_dst,
924 unsigned xmm_src )
925 {
926 sse_movups(
927 func,
928 make_xmm( xmm_dst ),
929 make_xmm( xmm_src ) );
930 }
931
932 static void
933 emit_mul (struct x86_function *func,
934 unsigned xmm_dst,
935 unsigned xmm_src)
936 {
937 sse_mulps(
938 func,
939 make_xmm( xmm_dst ),
940 make_xmm( xmm_src ) );
941 }
942
943 static void
944 emit_neg(
945 struct x86_function *func,
946 unsigned xmm )
947 {
948 sse_xorps(
949 func,
950 make_xmm( xmm ),
951 get_temp(
952 TGSI_EXEC_TEMP_80000000_I,
953 TGSI_EXEC_TEMP_80000000_C ) );
954 }
955
956 static void PIPE_CDECL
957 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
958 __attribute__((force_align_arg_pointer))
959 #endif
960 pow4f(
961 float *store )
962 {
963 #if defined(PIPE_ARCH_SSE)
964 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
965 #else
966 store[0] = util_fast_pow( store[0], store[4] );
967 store[1] = util_fast_pow( store[1], store[5] );
968 store[2] = util_fast_pow( store[2], store[6] );
969 store[3] = util_fast_pow( store[3], store[7] );
970 #endif
971 }
972
973 static void
974 emit_pow(
975 struct x86_function *func,
976 unsigned xmm_save,
977 unsigned xmm_dst,
978 unsigned xmm_src )
979 {
980 emit_func_call_dst_src(
981 func,
982 xmm_save,
983 xmm_dst,
984 xmm_src,
985 pow4f );
986 }
987
988 static void
989 emit_rcp (
990 struct x86_function *func,
991 unsigned xmm_dst,
992 unsigned xmm_src )
993 {
994 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
995 * good enough. Need to either emit a proper divide or use the
996 * iterative technique described below in emit_rsqrt().
997 */
998 sse2_rcpps(
999 func,
1000 make_xmm( xmm_dst ),
1001 make_xmm( xmm_src ) );
1002 }
1003
1004 static void PIPE_CDECL
1005 rnd4f(
1006 float *store )
1007 {
1008 store[0] = floorf( store[0] + 0.5f );
1009 store[1] = floorf( store[1] + 0.5f );
1010 store[2] = floorf( store[2] + 0.5f );
1011 store[3] = floorf( store[3] + 0.5f );
1012 }
1013
1014 static void
1015 emit_rnd(
1016 struct x86_function *func,
1017 unsigned xmm_save,
1018 unsigned xmm_dst )
1019 {
1020 emit_func_call_dst(
1021 func,
1022 xmm_save,
1023 xmm_dst,
1024 rnd4f );
1025 }
1026
1027 static void
1028 emit_rsqrt(
1029 struct x86_function *func,
1030 unsigned xmm_dst,
1031 unsigned xmm_src )
1032 {
1033 #if HIGH_PRECISION
 1034 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
 1035 * implementations, it is possible to improve their precision at
 1036 * fairly low cost, using a Newton-Raphson step, as below:
 1037 *
 1038 * for rcpps():   x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 1039 * for rsqrtps(): x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
1040 *
1041 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1042 */
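   /* Derivation of the rsqrt step used here: with y0 = rsqrtps(a) and
    * f(y) = 1/y^2 - a, Newton-Raphson gives
    *    y1 = y0 * (3 - a * y0^2) / 2,
    * roughly doubling the ~12 accurate bits of the hardware estimate.
    */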
1043 {
1044 struct x86_reg dst = make_xmm( xmm_dst );
1045 struct x86_reg src = make_xmm( xmm_src );
1046 struct x86_reg tmp0 = make_xmm( 2 );
1047 struct x86_reg tmp1 = make_xmm( 3 );
1048
1049 assert( xmm_dst != xmm_src );
1050 assert( xmm_dst != 2 && xmm_dst != 3 );
1051 assert( xmm_src != 2 && xmm_src != 3 );
1052
1053 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
1054 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
1055 sse_rsqrtps( func, tmp1, src );
1056 sse_mulps( func, src, tmp1 );
1057 sse_mulps( func, dst, tmp1 );
1058 sse_mulps( func, src, tmp1 );
1059 sse_subps( func, tmp0, src );
1060 sse_mulps( func, dst, tmp0 );
1061 }
1062 #else
1063 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1064 * good enough.
1065 */
1066 sse_rsqrtps(
1067 func,
1068 make_xmm( xmm_dst ),
1069 make_xmm( xmm_src ) );
1070 #endif
1071 }
1072
1073 static void
1074 emit_setsign(
1075 struct x86_function *func,
1076 unsigned xmm )
1077 {
1078 sse_orps(
1079 func,
1080 make_xmm( xmm ),
1081 get_temp(
1082 TGSI_EXEC_TEMP_80000000_I,
1083 TGSI_EXEC_TEMP_80000000_C ) );
1084 }
1085
1086 static void PIPE_CDECL
1087 sgn4f(
1088 float *store )
1089 {
1090 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1091 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1092 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1093 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1094 }
1095
1096 static void
1097 emit_sgn(
1098 struct x86_function *func,
1099 unsigned xmm_save,
1100 unsigned xmm_dst )
1101 {
1102 emit_func_call_dst(
1103 func,
1104 xmm_save,
1105 xmm_dst,
1106 sgn4f );
1107 }
1108
1109 static void PIPE_CDECL
1110 sin4f(
1111 float *store )
1112 {
1113 store[0] = sinf( store[0] );
1114 store[1] = sinf( store[1] );
1115 store[2] = sinf( store[2] );
1116 store[3] = sinf( store[3] );
1117 }
1118
1119 static void
1120 emit_sin (struct x86_function *func,
1121 unsigned xmm_save,
1122 unsigned xmm_dst)
1123 {
1124 emit_func_call_dst(
1125 func,
1126 xmm_save,
1127 xmm_dst,
1128 sin4f );
1129 }
1130
1131 static void
1132 emit_sub(
1133 struct x86_function *func,
1134 unsigned xmm_dst,
1135 unsigned xmm_src )
1136 {
1137 sse_subps(
1138 func,
1139 make_xmm( xmm_dst ),
1140 make_xmm( xmm_src ) );
1141 }
1142
1143 /**
1144 * Register fetch.
1145 */
1146
1147 static void
1148 emit_fetch(
1149 struct x86_function *func,
1150 unsigned xmm,
1151 const struct tgsi_full_src_register *reg,
1152 const unsigned chan_index )
1153 {
1154 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1155
1156 switch (swizzle) {
1157 case TGSI_EXTSWIZZLE_X:
1158 case TGSI_EXTSWIZZLE_Y:
1159 case TGSI_EXTSWIZZLE_Z:
1160 case TGSI_EXTSWIZZLE_W:
1161 switch (reg->SrcRegister.File) {
1162 case TGSI_FILE_CONSTANT:
1163 emit_const(
1164 func,
1165 xmm,
1166 reg->SrcRegister.Index,
1167 swizzle,
1168 reg->SrcRegister.Indirect,
1169 reg->SrcRegisterInd.File,
1170 reg->SrcRegisterInd.Index );
1171 break;
1172
1173 case TGSI_FILE_IMMEDIATE:
1174 emit_immediate(
1175 func,
1176 xmm,
1177 reg->SrcRegister.Index,
1178 swizzle );
1179 break;
1180
1181 case TGSI_FILE_INPUT:
1182 emit_inputf(
1183 func,
1184 xmm,
1185 reg->SrcRegister.Index,
1186 swizzle );
1187 break;
1188
1189 case TGSI_FILE_TEMPORARY:
1190 emit_tempf(
1191 func,
1192 xmm,
1193 reg->SrcRegister.Index,
1194 swizzle );
1195 break;
1196
1197 default:
1198 assert( 0 );
1199 }
1200 break;
1201
1202 case TGSI_EXTSWIZZLE_ZERO:
1203 emit_tempf(
1204 func,
1205 xmm,
1206 TGSI_EXEC_TEMP_00000000_I,
1207 TGSI_EXEC_TEMP_00000000_C );
1208 break;
1209
1210 case TGSI_EXTSWIZZLE_ONE:
1211 emit_tempf(
1212 func,
1213 xmm,
1214 TEMP_ONE_I,
1215 TEMP_ONE_C );
1216 break;
1217
1218 default:
1219 assert( 0 );
1220 }
1221
1222 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1223 case TGSI_UTIL_SIGN_CLEAR:
1224 emit_abs( func, xmm );
1225 break;
1226
1227 case TGSI_UTIL_SIGN_SET:
1228 emit_setsign( func, xmm );
1229 break;
1230
1231 case TGSI_UTIL_SIGN_TOGGLE:
1232 emit_neg( func, xmm );
1233 break;
1234
1235 case TGSI_UTIL_SIGN_KEEP:
1236 break;
1237 }
1238 }
1239
1240 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1241 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1242
1243 /**
1244 * Register store.
1245 */
1246
1247 static void
1248 emit_store(
1249 struct x86_function *func,
1250 unsigned xmm,
1251 const struct tgsi_full_dst_register *reg,
1252 const struct tgsi_full_instruction *inst,
1253 unsigned chan_index )
1254 {
1255 switch( reg->DstRegister.File ) {
1256 case TGSI_FILE_OUTPUT:
1257 emit_output(
1258 func,
1259 xmm,
1260 reg->DstRegister.Index,
1261 chan_index );
1262 break;
1263
1264 case TGSI_FILE_TEMPORARY:
1265 emit_temps(
1266 func,
1267 xmm,
1268 reg->DstRegister.Index,
1269 chan_index );
1270 break;
1271
1272 case TGSI_FILE_ADDRESS:
1273 emit_addrs(
1274 func,
1275 xmm,
1276 reg->DstRegister.Index,
1277 chan_index );
1278 break;
1279
1280 default:
1281 assert( 0 );
1282 }
1283
1284 switch( inst->Instruction.Saturate ) {
1285 case TGSI_SAT_NONE:
1286 break;
1287
 1288 case TGSI_SAT_ZERO_ONE:
 1289 /* XXX: saturation not implemented; values are stored unclamped */
 1290 break;
1291
1292 case TGSI_SAT_MINUS_PLUS_ONE:
1293 assert( 0 );
1294 break;
1295 }
1296 }
1297
1298 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1299 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1300
1301 /**
1302 * High-level instruction translators.
1303 */
1304
1305 static void
1306 emit_kil(
1307 struct x86_function *func,
1308 const struct tgsi_full_src_register *reg )
1309 {
1310 unsigned uniquemask;
 1311 unsigned registers[4];
 unsigned fetchedmask = 0; /* bit per chan_index that actually fetched a register */
1312 unsigned nextregister = 0;
1313 unsigned firstchan = ~0;
1314 unsigned chan_index;
1315
 1316 /* This mask stores swizzle components that were already tested. Note that
 1317 * we test if the value is less than zero, so the constant swizzles ONE and
 1318 * ZERO need not be tested. */
1319 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
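   /* Example: for KIL src.xxww only two fetches are emitted -- chan 0
    * claims swizzle X and chan 2 claims swizzle W; chans 1 and 3 find
    * their swizzle already in uniquemask and are skipped.
    */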
1320
1321 FOR_EACH_CHANNEL( chan_index ) {
1322 unsigned swizzle;
1323
1324 /* unswizzle channel */
1325 swizzle = tgsi_util_get_full_src_register_extswizzle(
1326 reg,
1327 chan_index );
1328
1329 /* check if the component has not been already tested */
1330 if( !(uniquemask & (1 << swizzle)) ) {
 1331 uniquemask |= 1 << swizzle;
 fetchedmask |= 1 << chan_index;
1332
1333 /* allocate register */
1334 registers[chan_index] = nextregister;
1335 emit_fetch(
1336 func,
1337 nextregister,
1338 reg,
1339 chan_index );
1340 nextregister++;
1341
1342 /* mark the first channel used */
1343 if( firstchan == ~0 ) {
1344 firstchan = chan_index;
1345 }
1346 }
1347 }
1348
1349 x86_push(
1350 func,
1351 x86_make_reg( file_REG32, reg_AX ) );
1352 x86_push(
1353 func,
1354 x86_make_reg( file_REG32, reg_DX ) );
1355
1356 FOR_EACH_CHANNEL( chan_index ) {
 1357 if( fetchedmask & (1 << chan_index) ) { /* uniquemask is indexed by swizzle, not chan, and could select an uninitialized registers[] entry */
1358 sse_cmpps(
1359 func,
1360 make_xmm( registers[chan_index] ),
1361 get_temp(
1362 TGSI_EXEC_TEMP_00000000_I,
1363 TGSI_EXEC_TEMP_00000000_C ),
1364 cc_LessThan );
1365
1366 if( chan_index == firstchan ) {
1367 sse_pmovmskb(
1368 func,
1369 x86_make_reg( file_REG32, reg_AX ),
1370 make_xmm( registers[chan_index] ) );
1371 }
1372 else {
1373 sse_pmovmskb(
1374 func,
1375 x86_make_reg( file_REG32, reg_DX ),
1376 make_xmm( registers[chan_index] ) );
1377 x86_or(
1378 func,
1379 x86_make_reg( file_REG32, reg_AX ),
1380 x86_make_reg( file_REG32, reg_DX ) );
1381 }
1382 }
1383 }
1384
1385 x86_or(
1386 func,
1387 get_temp(
1388 TGSI_EXEC_TEMP_KILMASK_I,
1389 TGSI_EXEC_TEMP_KILMASK_C ),
1390 x86_make_reg( file_REG32, reg_AX ) );
1391
1392 x86_pop(
1393 func,
1394 x86_make_reg( file_REG32, reg_DX ) );
1395 x86_pop(
1396 func,
1397 x86_make_reg( file_REG32, reg_AX ) );
1398 }
1399
1400
1401 static void
1402 emit_kilp(
1403 struct x86_function *func )
1404 {
1405 /* XXX todo / fix me */
1406 }
1407
1408
1409 static void
1410 emit_setcc(
1411 struct x86_function *func,
1412 struct tgsi_full_instruction *inst,
1413 enum sse_cc cc )
1414 {
1415 unsigned chan_index;
1416
1417 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1418 FETCH( func, *inst, 0, 0, chan_index );
1419 FETCH( func, *inst, 1, 1, chan_index );
1420 sse_cmpps(
1421 func,
1422 make_xmm( 0 ),
1423 make_xmm( 1 ),
1424 cc );
1425 sse_andps(
1426 func,
1427 make_xmm( 0 ),
1428 get_temp(
1429 TEMP_ONE_I,
1430 TEMP_ONE_C ) );
1431 STORE( func, *inst, 0, 0, chan_index );
1432 }
1433 }
1434
1435 static void
1436 emit_cmp(
1437 struct x86_function *func,
1438 struct tgsi_full_instruction *inst )
1439 {
1440 unsigned chan_index;
1441
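   /* CMP: dst = (src0 < 0) ? src1 : src2, computed branch-free with a
    * mask m = (src0 < 0) as dst = (m & src1) | (~m & src2).
    */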
1442 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1443 FETCH( func, *inst, 0, 0, chan_index );
1444 FETCH( func, *inst, 1, 1, chan_index );
1445 FETCH( func, *inst, 2, 2, chan_index );
1446 sse_cmpps(
1447 func,
1448 make_xmm( 0 ),
1449 get_temp(
1450 TGSI_EXEC_TEMP_00000000_I,
1451 TGSI_EXEC_TEMP_00000000_C ),
1452 cc_LessThan );
1453 sse_andps(
1454 func,
1455 make_xmm( 1 ),
1456 make_xmm( 0 ) );
1457 sse_andnps(
1458 func,
1459 make_xmm( 0 ),
1460 make_xmm( 2 ) );
1461 sse_orps(
1462 func,
1463 make_xmm( 0 ),
1464 make_xmm( 1 ) );
1465 STORE( func, *inst, 0, 0, chan_index );
1466 }
1467 }
1468
1469 static int
1470 emit_instruction(
1471 struct x86_function *func,
1472 struct tgsi_full_instruction *inst )
1473 {
1474 unsigned chan_index;
1475
1476 switch (inst->Instruction.Opcode) {
1477 case TGSI_OPCODE_ARL:
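      /* Note: emit_f2it() uses cvttps2dq, which truncates toward zero;
       * for negative inputs this differs from the floor() behaviour that
       * ARL nominally specifies.
       */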
1478 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1479 FETCH( func, *inst, 0, 0, chan_index );
1480 emit_f2it( func, 0 );
1481 STORE( func, *inst, 0, 0, chan_index );
1482 }
1483 break;
1484
1485 case TGSI_OPCODE_MOV:
1486 case TGSI_OPCODE_SWZ:
1487 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1488 FETCH( func, *inst, 0, 0, chan_index );
1489 STORE( func, *inst, 0, 0, chan_index );
1490 }
1491 break;
1492
1493 case TGSI_OPCODE_LIT:
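      /* LIT computes, per enabled channel (as emitted below):
       *   dst.x = 1
       *   dst.y = max(src.x, 0)
       *   dst.z = (src.x >= 0) ? pow(max(src.y, 0), clamp(src.w, -128, 128)) : 0
       *   dst.w = 1
       */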
1494 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1495 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1496 emit_tempf(
1497 func,
1498 0,
1499 TEMP_ONE_I,
1500 TEMP_ONE_C);
1501 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1502 STORE( func, *inst, 0, 0, CHAN_X );
1503 }
1504 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1505 STORE( func, *inst, 0, 0, CHAN_W );
1506 }
1507 }
1508 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1509 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1510 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1511 FETCH( func, *inst, 0, 0, CHAN_X );
1512 sse_maxps(
1513 func,
1514 make_xmm( 0 ),
1515 get_temp(
1516 TGSI_EXEC_TEMP_00000000_I,
1517 TGSI_EXEC_TEMP_00000000_C ) );
1518 STORE( func, *inst, 0, 0, CHAN_Y );
1519 }
1520 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1521 /* XMM[1] = SrcReg[0].yyyy */
1522 FETCH( func, *inst, 1, 0, CHAN_Y );
1523 /* XMM[1] = max(XMM[1], 0) */
1524 sse_maxps(
1525 func,
1526 make_xmm( 1 ),
1527 get_temp(
1528 TGSI_EXEC_TEMP_00000000_I,
1529 TGSI_EXEC_TEMP_00000000_C ) );
1530 /* XMM[2] = SrcReg[0].wwww */
1531 FETCH( func, *inst, 2, 0, CHAN_W );
1532 /* XMM[2] = min(XMM[2], 128.0) */
1533 sse_minps(
1534 func,
1535 make_xmm( 2 ),
1536 get_temp(
1537 TGSI_EXEC_TEMP_128_I,
1538 TGSI_EXEC_TEMP_128_C ) );
1539 /* XMM[2] = max(XMM[2], -128.0) */
1540 sse_maxps(
1541 func,
1542 make_xmm( 2 ),
1543 get_temp(
1544 TGSI_EXEC_TEMP_MINUS_128_I,
1545 TGSI_EXEC_TEMP_MINUS_128_C ) );
1546 emit_pow( func, 3, 1, 2 );
1547 FETCH( func, *inst, 0, 0, CHAN_X );
1548 sse_xorps(
1549 func,
1550 make_xmm( 2 ),
1551 make_xmm( 2 ) );
1552 sse_cmpps(
1553 func,
1554 make_xmm( 2 ),
1555 make_xmm( 0 ),
1556 cc_LessThanEqual );
1557 sse_andps(
1558 func,
1559 make_xmm( 2 ),
1560 make_xmm( 1 ) );
1561 STORE( func, *inst, 2, 0, CHAN_Z );
1562 }
1563 }
1564 break;
1565
1566 case TGSI_OPCODE_RCP:
1567 /* TGSI_OPCODE_RECIP */
1568 FETCH( func, *inst, 0, 0, CHAN_X );
1569 emit_rcp( func, 0, 0 );
1570 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1571 STORE( func, *inst, 0, 0, chan_index );
1572 }
1573 break;
1574
1575 case TGSI_OPCODE_RSQ:
1576 /* TGSI_OPCODE_RECIPSQRT */
1577 FETCH( func, *inst, 0, 0, CHAN_X );
1578 emit_rsqrt( func, 1, 0 );
1579 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1580 STORE( func, *inst, 1, 0, chan_index );
1581 }
1582 break;
1583
1584 case TGSI_OPCODE_EXP:
1585 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1586 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1587 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1588 FETCH( func, *inst, 0, 0, CHAN_X );
1589 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1590 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1591 emit_MOV( func, 1, 0 );
1592 emit_flr( func, 2, 1 );
1593 /* dst.x = ex2(floor(src.x)) */
1594 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1595 emit_MOV( func, 2, 1 );
1596 emit_ex2( func, 3, 2 );
1597 STORE( func, *inst, 2, 0, CHAN_X );
1598 }
1599 /* dst.y = src.x - floor(src.x) */
1600 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1601 emit_MOV( func, 2, 0 );
1602 emit_sub( func, 2, 1 );
1603 STORE( func, *inst, 2, 0, CHAN_Y );
1604 }
1605 }
1606 /* dst.z = ex2(src.x) */
1607 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1608 emit_ex2( func, 3, 0 );
1609 STORE( func, *inst, 0, 0, CHAN_Z );
1610 }
1611 }
1612 /* dst.w = 1.0 */
1613 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1614 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1615 STORE( func, *inst, 0, 0, CHAN_W );
1616 }
1617 break;
1618
1619 case TGSI_OPCODE_LOG:
1620 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1621 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1622 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1623 FETCH( func, *inst, 0, 0, CHAN_X );
1624 emit_abs( func, 0 );
1625 emit_MOV( func, 1, 0 );
1626 emit_lg2( func, 2, 1 );
1627 /* dst.z = lg2(abs(src.x)) */
1628 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1629 STORE( func, *inst, 1, 0, CHAN_Z );
1630 }
1631 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1632 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1633 emit_flr( func, 2, 1 );
1634 /* dst.x = floor(lg2(abs(src.x))) */
1635 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1636 STORE( func, *inst, 1, 0, CHAN_X );
1637 }
 1638 /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1639 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1640 emit_ex2( func, 2, 1 );
1641 emit_rcp( func, 1, 1 );
1642 emit_mul( func, 0, 1 );
1643 STORE( func, *inst, 0, 0, CHAN_Y );
1644 }
1645 }
1646 }
1647 /* dst.w = 1.0 */
1648 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1649 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1650 STORE( func, *inst, 0, 0, CHAN_W );
1651 }
1652 break;
1653
1654 case TGSI_OPCODE_MUL:
1655 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1656 FETCH( func, *inst, 0, 0, chan_index );
1657 FETCH( func, *inst, 1, 1, chan_index );
1658 emit_mul( func, 0, 1 );
1659 STORE( func, *inst, 0, 0, chan_index );
1660 }
1661 break;
1662
1663 case TGSI_OPCODE_ADD:
1664 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1665 FETCH( func, *inst, 0, 0, chan_index );
1666 FETCH( func, *inst, 1, 1, chan_index );
1667 emit_add( func, 0, 1 );
1668 STORE( func, *inst, 0, 0, chan_index );
1669 }
1670 break;
1671
1672 case TGSI_OPCODE_DP3:
1673 /* TGSI_OPCODE_DOT3 */
1674 FETCH( func, *inst, 0, 0, CHAN_X );
1675 FETCH( func, *inst, 1, 1, CHAN_X );
1676 emit_mul( func, 0, 1 );
1677 FETCH( func, *inst, 1, 0, CHAN_Y );
1678 FETCH( func, *inst, 2, 1, CHAN_Y );
1679 emit_mul( func, 1, 2 );
1680 emit_add( func, 0, 1 );
1681 FETCH( func, *inst, 1, 0, CHAN_Z );
1682 FETCH( func, *inst, 2, 1, CHAN_Z );
1683 emit_mul( func, 1, 2 );
1684 emit_add( func, 0, 1 );
1685 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1686 STORE( func, *inst, 0, 0, chan_index );
1687 }
1688 break;
1689
1690 case TGSI_OPCODE_DP4:
1691 /* TGSI_OPCODE_DOT4 */
1692 FETCH( func, *inst, 0, 0, CHAN_X );
1693 FETCH( func, *inst, 1, 1, CHAN_X );
1694 emit_mul( func, 0, 1 );
1695 FETCH( func, *inst, 1, 0, CHAN_Y );
1696 FETCH( func, *inst, 2, 1, CHAN_Y );
1697 emit_mul( func, 1, 2 );
1698 emit_add( func, 0, 1 );
1699 FETCH( func, *inst, 1, 0, CHAN_Z );
1700 FETCH( func, *inst, 2, 1, CHAN_Z );
1701 emit_mul(func, 1, 2 );
1702 emit_add(func, 0, 1 );
1703 FETCH( func, *inst, 1, 0, CHAN_W );
1704 FETCH( func, *inst, 2, 1, CHAN_W );
1705 emit_mul( func, 1, 2 );
1706 emit_add( func, 0, 1 );
1707 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1708 STORE( func, *inst, 0, 0, chan_index );
1709 }
1710 break;
1711
1712 case TGSI_OPCODE_DST:
1713 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1714 emit_tempf(
1715 func,
1716 0,
1717 TEMP_ONE_I,
1718 TEMP_ONE_C );
1719 STORE( func, *inst, 0, 0, CHAN_X );
1720 }
1721 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1722 FETCH( func, *inst, 0, 0, CHAN_Y );
1723 FETCH( func, *inst, 1, 1, CHAN_Y );
1724 emit_mul( func, 0, 1 );
1725 STORE( func, *inst, 0, 0, CHAN_Y );
1726 }
1727 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1728 FETCH( func, *inst, 0, 0, CHAN_Z );
1729 STORE( func, *inst, 0, 0, CHAN_Z );
1730 }
1731 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1732 FETCH( func, *inst, 0, 1, CHAN_W );
1733 STORE( func, *inst, 0, 0, CHAN_W );
1734 }
1735 break;
1736
1737 case TGSI_OPCODE_MIN:
1738 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1739 FETCH( func, *inst, 0, 0, chan_index );
1740 FETCH( func, *inst, 1, 1, chan_index );
1741 sse_minps(
1742 func,
1743 make_xmm( 0 ),
1744 make_xmm( 1 ) );
1745 STORE( func, *inst, 0, 0, chan_index );
1746 }
1747 break;
1748
1749 case TGSI_OPCODE_MAX:
1750 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1751 FETCH( func, *inst, 0, 0, chan_index );
1752 FETCH( func, *inst, 1, 1, chan_index );
1753 sse_maxps(
1754 func,
1755 make_xmm( 0 ),
1756 make_xmm( 1 ) );
1757 STORE( func, *inst, 0, 0, chan_index );
1758 }
1759 break;
1760
1761 case TGSI_OPCODE_SLT:
1762 /* TGSI_OPCODE_SETLT */
1763 emit_setcc( func, inst, cc_LessThan );
1764 break;
1765
1766 case TGSI_OPCODE_SGE:
1767 /* TGSI_OPCODE_SETGE */
1768 emit_setcc( func, inst, cc_NotLessThan );
1769 break;
1770
1771 case TGSI_OPCODE_MAD:
1772 /* TGSI_OPCODE_MADD */
1773 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1774 FETCH( func, *inst, 0, 0, chan_index );
1775 FETCH( func, *inst, 1, 1, chan_index );
1776 FETCH( func, *inst, 2, 2, chan_index );
1777 emit_mul( func, 0, 1 );
1778 emit_add( func, 0, 2 );
1779 STORE( func, *inst, 0, 0, chan_index );
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_SUB:
1784 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1785 FETCH( func, *inst, 0, 0, chan_index );
1786 FETCH( func, *inst, 1, 1, chan_index );
1787 emit_sub( func, 0, 1 );
1788 STORE( func, *inst, 0, 0, chan_index );
1789 }
1790 break;
1791
1792 case TGSI_OPCODE_LERP:
1793 /* TGSI_OPCODE_LRP */
1794 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1795 FETCH( func, *inst, 0, 0, chan_index );
1796 FETCH( func, *inst, 1, 1, chan_index );
1797 FETCH( func, *inst, 2, 2, chan_index );
1798 emit_sub( func, 1, 2 );
1799 emit_mul( func, 0, 1 );
1800 emit_add( func, 0, 2 );
1801 STORE( func, *inst, 0, 0, chan_index );
1802 }
1803 break;
1804
1805 case TGSI_OPCODE_CND:
1806 return 0;
1807 break;
1808
1809 case TGSI_OPCODE_CND0:
1810 return 0;
1811 break;
1812
1813 case TGSI_OPCODE_DOT2ADD:
1814 /* TGSI_OPCODE_DP2A */
1815 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
1816 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
1817 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1818 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
1819 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
1820 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1821 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1822 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
1823 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1824 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1825 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
1826 }
1827 break;
1828
1829 case TGSI_OPCODE_INDEX:
1830 return 0;
1831 break;
1832
1833 case TGSI_OPCODE_NEGATE:
1834 return 0;
1835 break;
1836
1837 case TGSI_OPCODE_FRAC:
1838 /* TGSI_OPCODE_FRC */
1839 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1840 FETCH( func, *inst, 0, 0, chan_index );
1841 emit_frc( func, 0, 0 );
1842 STORE( func, *inst, 0, 0, chan_index );
1843 }
1844 break;
1845
1846 case TGSI_OPCODE_CLAMP:
1847 return 0;
1848 break;
1849
1850 case TGSI_OPCODE_FLOOR:
1851 /* TGSI_OPCODE_FLR */
1852 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1853 FETCH( func, *inst, 0, 0, chan_index );
1854 emit_flr( func, 0, 0 );
1855 STORE( func, *inst, 0, 0, chan_index );
1856 }
1857 break;
1858
1859 case TGSI_OPCODE_ROUND:
1860 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1861 FETCH( func, *inst, 0, 0, chan_index );
1862 emit_rnd( func, 0, 0 );
1863 STORE( func, *inst, 0, 0, chan_index );
1864 }
1865 break;
1866
1867 case TGSI_OPCODE_EXPBASE2:
1868 /* TGSI_OPCODE_EX2 */
1869 FETCH( func, *inst, 0, 0, CHAN_X );
1870 emit_ex2( func, 0, 0 );
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1872 STORE( func, *inst, 0, 0, chan_index );
1873 }
1874 break;
1875
1876 case TGSI_OPCODE_LOGBASE2:
1877 /* TGSI_OPCODE_LG2 */
1878 FETCH( func, *inst, 0, 0, CHAN_X );
1879 emit_lg2( func, 0, 0 );
1880 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1881 STORE( func, *inst, 0, 0, chan_index );
1882 }
1883 break;
1884
1885 case TGSI_OPCODE_POWER:
1886 /* TGSI_OPCODE_POW */
1887 FETCH( func, *inst, 0, 0, CHAN_X );
1888 FETCH( func, *inst, 1, 1, CHAN_X );
1889 emit_pow( func, 0, 0, 1 );
1890 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1891 STORE( func, *inst, 0, 0, chan_index );
1892 }
1893 break;
1894
1895 case TGSI_OPCODE_CROSSPRODUCT:
1896 /* TGSI_OPCODE_XPD */
1897 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1898 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1899 FETCH( func, *inst, 1, 1, CHAN_Z );
1900 FETCH( func, *inst, 3, 0, CHAN_Z );
1901 }
1902 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1903 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1904 FETCH( func, *inst, 0, 0, CHAN_Y );
1905 FETCH( func, *inst, 4, 1, CHAN_Y );
1906 }
1907 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1908 emit_MOV( func, 2, 0 );
1909 emit_mul( func, 2, 1 );
1910 emit_MOV( func, 5, 3 );
1911 emit_mul( func, 5, 4 );
1912 emit_sub( func, 2, 5 );
1913 STORE( func, *inst, 2, 0, CHAN_X );
1914 }
1915 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1916 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1917 FETCH( func, *inst, 2, 1, CHAN_X );
1918 FETCH( func, *inst, 5, 0, CHAN_X );
1919 }
1920 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1921 emit_mul( func, 3, 2 );
1922 emit_mul( func, 1, 5 );
1923 emit_sub( func, 3, 1 );
1924 STORE( func, *inst, 3, 0, CHAN_Y );
1925 }
1926 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1927 emit_mul( func, 5, 4 );
1928 emit_mul( func, 0, 2 );
1929 emit_sub( func, 5, 0 );
1930 STORE( func, *inst, 5, 0, CHAN_Z );
1931 }
1932 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1933 emit_tempf(
1934 func,
1935 0,
1936 TEMP_ONE_I,
1937 TEMP_ONE_C );
1938 STORE( func, *inst, 0, 0, CHAN_W );
1939 }
1940 break;
1941
1942 case TGSI_OPCODE_MULTIPLYMATRIX:
1943 return 0;
1944 break;
1945
1946 case TGSI_OPCODE_ABS:
1947 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1948 FETCH( func, *inst, 0, 0, chan_index );
 1949 emit_abs( func, 0 );
1951 STORE( func, *inst, 0, 0, chan_index );
1952 }
1953 break;
1954
1955 case TGSI_OPCODE_RCC:
1956 return 0;
1957 break;
1958
1959 case TGSI_OPCODE_DPH:
1960 FETCH( func, *inst, 0, 0, CHAN_X );
1961 FETCH( func, *inst, 1, 1, CHAN_X );
1962 emit_mul( func, 0, 1 );
1963 FETCH( func, *inst, 1, 0, CHAN_Y );
1964 FETCH( func, *inst, 2, 1, CHAN_Y );
1965 emit_mul( func, 1, 2 );
1966 emit_add( func, 0, 1 );
1967 FETCH( func, *inst, 1, 0, CHAN_Z );
1968 FETCH( func, *inst, 2, 1, CHAN_Z );
1969 emit_mul( func, 1, 2 );
1970 emit_add( func, 0, 1 );
1971 FETCH( func, *inst, 1, 1, CHAN_W );
1972 emit_add( func, 0, 1 );
1973 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1974 STORE( func, *inst, 0, 0, chan_index );
1975 }
1976 break;
1977
1978 case TGSI_OPCODE_COS:
1979 FETCH( func, *inst, 0, 0, CHAN_X );
1980 emit_cos( func, 0, 0 );
1981 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1982 STORE( func, *inst, 0, 0, chan_index );
1983 }
1984 break;
1985
1986 case TGSI_OPCODE_DDX:
1987 return 0;
1988 break;
1989
1990 case TGSI_OPCODE_DDY:
1991 return 0;
1992 break;
1993
1994 case TGSI_OPCODE_KILP:
1995 /* predicated kill */
1996 emit_kilp( func );
1997 return 0; /* XXX fix me */
1998 break;
1999
2000 case TGSI_OPCODE_KIL:
2001 /* conditional kill */
2002 emit_kil( func, &inst->FullSrcRegisters[0] );
2003 break;
2004
2005 case TGSI_OPCODE_PK2H:
2006 return 0;
2007 break;
2008
2009 case TGSI_OPCODE_PK2US:
2010 return 0;
2011 break;
2012
2013 case TGSI_OPCODE_PK4B:
2014 return 0;
2015 break;
2016
2017 case TGSI_OPCODE_PK4UB:
2018 return 0;
2019 break;
2020
2021 case TGSI_OPCODE_RFL:
2022 return 0;
2023 break;
2024
2025 case TGSI_OPCODE_SEQ:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_SFL:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_SGT:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_SIN:
2038 FETCH( func, *inst, 0, 0, CHAN_X );
2039 emit_sin( func, 0, 0 );
2040 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2041 STORE( func, *inst, 0, 0, chan_index );
2042 }
2043 break;
2044
2045 case TGSI_OPCODE_SLE:
2046 return 0;
2047 break;
2048
2049 case TGSI_OPCODE_SNE:
2050 return 0;
2051 break;
2052
2053 case TGSI_OPCODE_STR:
2054 return 0;
2055 break;
2056
2057 case TGSI_OPCODE_TEX:
2058 if (0) {
2059 /* Disable dummy texture code:
2060 */
2061 emit_tempf(
2062 func,
2063 0,
2064 TEMP_ONE_I,
2065 TEMP_ONE_C );
2066 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2067 STORE( func, *inst, 0, 0, chan_index );
2068 }
2069 }
2070 else {
2071 return 0;
2072 }
2073 break;
2074
2075 case TGSI_OPCODE_TXD:
2076 return 0;
2077 break;
2078
2079 case TGSI_OPCODE_UP2H:
2080 return 0;
2081 break;
2082
2083 case TGSI_OPCODE_UP2US:
2084 return 0;
2085 break;
2086
2087 case TGSI_OPCODE_UP4B:
2088 return 0;
2089 break;
2090
2091 case TGSI_OPCODE_UP4UB:
2092 return 0;
2093 break;
2094
2095 case TGSI_OPCODE_X2D:
2096 return 0;
2097 break;
2098
2099 case TGSI_OPCODE_ARA:
2100 return 0;
2101 break;
2102
2103 case TGSI_OPCODE_ARR:
2104 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2105 FETCH( func, *inst, 0, 0, chan_index );
2106 emit_rnd( func, 0, 0 );
2107 emit_f2it( func, 0 );
2108 STORE( func, *inst, 0, 0, chan_index );
2109 }
2110 break;
2111
2112 case TGSI_OPCODE_BRA:
2113 return 0;
2114 break;
2115
2116 case TGSI_OPCODE_CAL:
2117 return 0;
2118 break;
2119
2120 case TGSI_OPCODE_RET:
2121 emit_ret( func );
2122 break;
2123
2124 case TGSI_OPCODE_END:
2125 break;
2126
2127 case TGSI_OPCODE_SSG:
2128 /* TGSI_OPCODE_SGN */
2129 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2130 FETCH( func, *inst, 0, 0, chan_index );
2131 emit_sgn( func, 0, 0 );
2132 STORE( func, *inst, 0, 0, chan_index );
2133 }
2134 break;
2135
2136 case TGSI_OPCODE_CMP:
2137 emit_cmp (func, inst);
2138 break;
2139
2140 case TGSI_OPCODE_SCS:
2141 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2142 FETCH( func, *inst, 0, 0, CHAN_X );
2143 emit_cos( func, 0, 0 );
2144 STORE( func, *inst, 0, 0, CHAN_X );
2145 }
2146 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2147 FETCH( func, *inst, 0, 0, CHAN_X );
2148 emit_sin( func, 0, 0 );
2149 STORE( func, *inst, 0, 0, CHAN_Y );
2150 }
2151 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2152 emit_tempf(
2153 func,
2154 0,
2155 TGSI_EXEC_TEMP_00000000_I,
2156 TGSI_EXEC_TEMP_00000000_C );
2157 STORE( func, *inst, 0, 0, CHAN_Z );
2158 }
2159 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2160 emit_tempf(
2161 func,
2162 0,
2163 TEMP_ONE_I,
2164 TEMP_ONE_C );
2165 STORE( func, *inst, 0, 0, CHAN_W );
2166 }
2167 break;
2168
2169 case TGSI_OPCODE_TXB:
2170 return 0;
2171 break;
2172
2173 case TGSI_OPCODE_NRM:
2174 /* fall-through */
2175 case TGSI_OPCODE_NRM4:
2176 /* 3 or 4-component normalization */
2177 {
2178 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2179 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2180 FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
2181 FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
2182 FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
2183 if (dims == 4) {
2184 FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
2185 }
 2186 emit_MOV( func, 0, 4 ); /* xmm0 = xmm4 */
 2187 emit_mul( func, 0, 4 ); /* xmm0 *= xmm4 */
 2188 emit_MOV( func, 1, 5 ); /* xmm1 = xmm5 */
 2189 emit_mul( func, 1, 5 ); /* xmm1 *= xmm5 */
 2190 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
 2191 emit_MOV( func, 1, 6 ); /* xmm1 = xmm6 */
 2192 emit_mul( func, 1, 6 ); /* xmm1 *= xmm6 */
 2193 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
 2194 if (dims == 4) {
 2195 emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
 2196 emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
 2197 emit_add( func, 0, 1 ); /* xmm0 += xmm1 (was emit_add(func, 0, 0), which doubled xmm0 and dropped the w term) */
2198 }
2199 emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2200 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2201 if (chan_index < dims) {
2202 emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
2203 STORE( func, *inst, 4+chan_index, 0, chan_index );
2204 }
2205 }
2206 }
2207 break;
2208
2209 case TGSI_OPCODE_DIV:
2210 return 0;
2211 break;
2212
2213 case TGSI_OPCODE_DP2:
2214 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2215 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2216 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2217 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2218 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2219 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2220 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2221 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2222 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2223 }
2224 break;
2225
2226 case TGSI_OPCODE_TXL:
2227 return 0;
2228 break;
2229
2230 case TGSI_OPCODE_BRK:
2231 return 0;
2232 break;
2233
2234 case TGSI_OPCODE_IF:
2235 return 0;
2236 break;
2237
2238 case TGSI_OPCODE_LOOP:
2239 return 0;
2240 break;
2241
2242 case TGSI_OPCODE_REP:
2243 return 0;
2244 break;
2245
2246 case TGSI_OPCODE_ELSE:
2247 return 0;
2248 break;
2249
2250 case TGSI_OPCODE_ENDIF:
2251 return 0;
2252 break;
2253
2254 case TGSI_OPCODE_ENDLOOP:
2255 return 0;
2256 break;
2257
2258 case TGSI_OPCODE_ENDREP:
2259 return 0;
2260 break;
2261
2262 case TGSI_OPCODE_PUSHA:
2263 return 0;
2264 break;
2265
2266 case TGSI_OPCODE_POPA:
2267 return 0;
2268 break;
2269
2270 case TGSI_OPCODE_CEIL:
2271 return 0;
2272 break;
2273
2274 case TGSI_OPCODE_I2F:
2275 return 0;
2276 break;
2277
2278 case TGSI_OPCODE_NOT:
2279 return 0;
2280 break;
2281
2282 case TGSI_OPCODE_TRUNC:
2283 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2284 FETCH( func, *inst, 0, 0, chan_index );
2285 emit_f2it( func, 0 );
2286 emit_i2f( func, 0 );
2287 STORE( func, *inst, 0, 0, chan_index );
2288 }
2289 break;
2290
2291 case TGSI_OPCODE_SHL:
2292 return 0;
2293 break;
2294
2295 case TGSI_OPCODE_SHR:
2296 return 0;
2297 break;
2298
2299 case TGSI_OPCODE_AND:
2300 return 0;
2301 break;
2302
2303 case TGSI_OPCODE_OR:
2304 return 0;
2305 break;
2306
2307 case TGSI_OPCODE_MOD:
2308 return 0;
2309 break;
2310
2311 case TGSI_OPCODE_XOR:
2312 return 0;
2313 break;
2314
2315 case TGSI_OPCODE_SAD:
2316 return 0;
2317 break;
2318
2319 case TGSI_OPCODE_TXF:
2320 return 0;
2321 break;
2322
2323 case TGSI_OPCODE_TXQ:
2324 return 0;
2325 break;
2326
2327 case TGSI_OPCODE_CONT:
2328 return 0;
2329 break;
2330
2331 case TGSI_OPCODE_EMIT:
2332 return 0;
2333 break;
2334
2335 case TGSI_OPCODE_ENDPRIM:
2336 return 0;
2337 break;
2338
2339 default:
2340 return 0;
2341 }
2342
2343 return 1;
2344 }

/**
 * Emit code to compute a fragment shader input, interpolating the
 * plane coefficients (a0, dadx, dady) according to the input's
 * declared interpolation mode.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
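
/* For reference -- illustrative only: the per-channel math the three
 * interpolation modes above compute.  The helper names are ours, not
 * part of the TGSI machine.
 */
#if 0
static float
interp_constant( float a0 )
{
   return a0;
}

static float
interp_linear( float a0, float dadx, float dady, float x, float y )
{
   return a0 + x * dadx + y * dady;
}

static float
interp_perspective( float a0, float dadx, float dady,
                    float x, float y, float w )
{
   return (a0 + x * dadx + y * dady) * (1.0f / w);
}
#endif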

/**
 * Transpose vertex data from AoS (array-of-structures, one xyzw
 * vector per vertex) to the SoA (structure-of-arrays) layout the
 * generated code expects: four x's, four y's, four z's, four w's
 * per attribute.
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_soa,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Gather one attribute from four vertices:
       * xmm0 = x0 y0 x1 y1, xmm3 = z0 w0 z1 w1,
       * xmm1 = x2 y2 x3 y3, xmm4 = z2 w2 z3 w3
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Shuffle into per-channel vectors:
       * xmm0 = all x, xmm2 = all y, xmm3 = all z, xmm5 = all w
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_input );
}
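
/* For reference -- illustrative only: the equivalent plain-C transpose
 * for one attribute of four vertices, ignoring the stride walking the
 * generated code performs.
 */
#if 0
static void
aos_to_soa_ref( const float aos[4][4], float soa[4][4] )
{
   unsigned chan, vert;
   for (chan = 0; chan < 4; chan++)
      for (vert = 0; vert < 4; vert++)
         soa[chan][vert] = aos[vert][chan];
}
#endif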

/**
 * The inverse of aos_to_soa(): transpose shader outputs from SoA
 * back to one xyzw vector per vertex.
 */
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
{
   struct x86_reg soa_output;
   struct x86_reg aos_output;
   struct x86_reg num_outputs;
   struct x86_reg temp;
   int inner_loop;

   soa_output = x86_make_reg( file_REG32, reg_AX );
   aos_output = x86_make_reg( file_REG32, reg_BX );
   num_outputs = x86_make_reg( file_REG32, reg_CX );
   temp = x86_make_reg( file_REG32, reg_DX );

   /* Save EBX */
   x86_push( func, aos_output );

   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* xmm0 = all x, xmm1 = all y, xmm3 = all z, xmm4 = all w */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave back into per-vertex order:
       * xmm0 = x0 y0 x1 y1, xmm2 = x2 y2 x3 y3,
       * xmm3 = z0 w0 z1 w1, xmm5 = z2 w2 z3 w3
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* Scatter one xyzw vector per vertex, stepping by the stride. */
      x86_mov( func, temp, x86_fn_arg( func, stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_output );
}
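
/* For reference -- illustrative only: soa_to_aos() is the exact
 * inverse of the aos_to_soa() transpose above.
 */
#if 0
static void
soa_to_aos_ref( const float soa[4][4], float aos[4][4] )
{
   unsigned chan, vert;
   for (chan = 0; chan < 4; chan++)
      for (vert = 0; vert < 4; vert++)
         aos[vert][chan] = soa[chan][vert];
}
#endif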

/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs.  Because on x86 we have only 4 GP registers free here, and
 * five shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- a DECLARATION and an INSTRUCTION
 * phase.  The GP register holding the output argument is aliased with
 * the coef argument, as outputs are not needed in the DECLARATION
 * phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert inputs from AoS to SoA
 *                     layout before running and outputs back afterwards
 * \return 1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );

   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      if (do_swizzles)
         aos_to_soa( func,
                     6,     /* aos_input */
                     1,     /* machine->input */
                     7,     /* num_inputs */
                     8 );   /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
                     9,     /* aos_output */
                     2,     /* machine->output */
                     10,    /* num_outputs */
                     11 );  /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
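
/* For reference -- a minimal sketch, illustrative only, of driving this
 * translator, assuming the rtasm x86_init_func()/x86_get_func()/
 * x86_release_func() entry points:
 */
#if 0
static void
example_translate( const struct tgsi_token *tokens )
{
   struct x86_function func;
   float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];

   x86_init_func( &func );
   if (tgsi_emit_sse2( tokens, &func, immediates, TRUE )) {
      /* cast x86_get_func( &func ) to the expected shader
       * signature and call it with the machine's register file
       */
   }
   x86_release_func( &func );
}
#endif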

#endif /* PIPE_ARCH_X86 */