1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #if defined(PIPE_ARCH_SSE)
36 #include "util/u_sse.h"
38 #include "tgsi/tgsi_parse.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi_exec.h"
41 #include "tgsi_sse2.h"
43 #include "rtasm/rtasm_x86sse.h"
47 * This costs about 100fps (close to 10%) in gears:
49 #define HIGH_PRECISION 1
/* Iterate CHAN over all four SOA channels (X, Y, Z, W) of a register. */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero if channel CHAN is enabled in dst register 0's writemask. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Guard the following statement on CHAN being write-enabled in dst 0. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only the channels enabled in dst register 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Short aliases for well-known slots in the TGSI exec machine's temp file.
 * The *_I names are temp-register indices, the *_C names channel indices. */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
82 * X86 utility functions.
91 (enum x86_reg_name
) xmm
);
95 * X86 register mapping helpers.
99 get_const_base( void )
106 static struct x86_reg
107 get_input_base( void )
114 static struct x86_reg
115 get_output_base( void )
122 static struct x86_reg
123 get_temp_base( void )
/**
 * Return the base x86 register through which interpolation coefficients
 * are addressed.  Coefficients share the same base register as shader
 * outputs (see get_output_base()).
 */
static struct x86_reg
get_coef_base( void )
{
   return get_output_base();
}
136 static struct x86_reg
137 get_immediate_base( void )
146 * Data access helpers.
150 static struct x86_reg
155 return x86_make_disp(
156 get_immediate_base(),
157 (vec
* 4 + chan
) * 4 );
160 static struct x86_reg
165 return x86_make_disp(
167 (vec
* 4 + chan
) * 4 );
170 static struct x86_reg
175 return x86_make_disp(
177 (vec
* 4 + chan
) * 16 );
180 static struct x86_reg
185 return x86_make_disp(
187 (vec
* 4 + chan
) * 16 );
190 static struct x86_reg
195 return x86_make_disp(
197 (vec
* 4 + chan
) * 16 );
200 static struct x86_reg
206 return x86_make_disp(
208 ((vec
* 3 + member
) * 4 + chan
) * 4 );
214 struct x86_function
*func
)
221 * Data fetch helpers.
225 * Copy a shader constant to xmm register
226 * \param xmm the destination xmm register
227 * \param vec the src const buffer index
228 * \param chan src channel to fetch (X, Y, Z or W)
232 struct x86_function
*func
,
241 /* 'vec' is the offset from the address register's value.
242 * We're loading CONST[ADDR+vec] into an xmm register.
244 struct x86_reg r0
= get_input_base();
245 struct x86_reg r1
= get_output_base();
248 assert( indirectFile
== TGSI_FILE_ADDRESS
);
249 assert( indirectIndex
== 0 );
251 x86_push( func
, r0
);
252 x86_push( func
, r1
);
255 * Loop over the four pixels or vertices in the quad.
256 * Get the value of the address (offset) register for pixel/vertex[i],
257 * add it to the src offset and index into the constant buffer.
258 * Note that we're working on SOA data.
259 * If any of the pixel/vertex execution channels are unused their
260 * values will be garbage. It's very important that we don't use
261 * those garbage values as indexes into the constant buffer since
262 * that'll cause segfaults.
263 * The solution is to bitwise-AND the offset with the execution mask
264 * register whose values are either 0 or ~0.
265 * The caller must setup the execution mask register to indicate
266 * which channels are valid/alive before running the shader.
267 * The execution mask will also figure into loops and conditionals
270 for (i
= 0; i
< QUAD_SIZE
; i
++) {
271 /* r1 = address register[i] */
272 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
273 /* r0 = execution mask[i] */
274 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
276 x86_and( func
, r1
, r0
);
277 /* r0 = 'vec', the offset */
278 x86_lea( func
, r0
, get_const( vec
, chan
) );
280 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
282 x86_add( func
, r1
, r1
);
283 x86_add( func
, r1
, r1
);
284 x86_add( func
, r1
, r1
);
285 x86_add( func
, r1
, r1
);
287 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
288 x86_mov( func
, r1
, x86_deref( r0
) );
289 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
298 get_temp( TEMP_R0
, CHAN_X
) );
301 /* 'vec' is the index into the src register file, such as TEMP[vec] */
307 get_const( vec
, chan
) );
312 SHUF( 0, 0, 0, 0 ) );
318 struct x86_function
*func
,
326 get_immediate( vec
, chan
) );
331 SHUF( 0, 0, 0, 0 ) );
336 * Copy a shader input to xmm register
337 * \param xmm the destination xmm register
338 * \param vec the src input attrib
339 * \param chan src channel to fetch (X, Y, Z or W)
343 struct x86_function
*func
,
351 get_input( vec
, chan
) );
355 * Store an xmm register to a shader output
356 * \param xmm the source xmm register
357 * \param vec the dest output attrib
358 * \param chan src dest channel to store (X, Y, Z or W)
362 struct x86_function
*func
,
369 get_output( vec
, chan
),
374 * Copy a shader temporary to xmm register
375 * \param xmm the destination xmm register
376 * \param vec the src temp register
377 * \param chan src channel to fetch (X, Y, Z or W)
381 struct x86_function
*func
,
389 get_temp( vec
, chan
) );
393 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
394 * \param xmm the destination xmm register
395 * \param vec the src input/attribute coefficient index
396 * \param chan src channel to fetch (X, Y, Z or W)
397 * \param member 0=a0, 1=dadx, 2=dady
401 struct x86_function
*func
,
410 get_coef( vec
, chan
, member
) );
415 SHUF( 0, 0, 0, 0 ) );
419 * Data store helpers.
424 struct x86_function
*func
,
431 get_input( vec
, chan
),
437 struct x86_function
*func
,
444 get_temp( vec
, chan
),
450 struct x86_function
*func
,
460 vec
+ TGSI_EXEC_TEMP_ADDR
,
465 * Coefficent fetch helpers.
470 struct x86_function
*func
,
485 struct x86_function
*func
,
500 struct x86_function
*func
,
514 * Function call helpers.
518 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
519 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
520 * that the stack pointer is 16 byte aligned, as expected.
524 struct x86_function
*func
,
527 void (PIPE_CDECL
*code
)() )
529 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
533 /* Bitmask of the xmm registers to save */
534 xmm_mask
= (1 << xmm_save
) - 1;
535 xmm_mask
&= ~(1 << xmm_dst
);
539 get_temp( TEMP_R0
, 0 ),
540 make_xmm( xmm_dst
) );
544 x86_make_reg( file_REG32
, reg_AX
) );
547 x86_make_reg( file_REG32
, reg_CX
) );
550 x86_make_reg( file_REG32
, reg_DX
) );
552 for(i
= 0, n
= 0; i
< 8; ++i
)
553 if(xmm_mask
& (1 << i
))
558 x86_make_reg( file_REG32
, reg_SP
),
561 for(i
= 0, n
= 0; i
< 8; ++i
)
562 if(xmm_mask
& (1 << i
)) {
565 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
573 get_temp( TEMP_R0
, 0 ) );
575 x86_push( func
, ecx
);
576 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
577 x86_call( func
, ecx
);
580 for(i
= 0, n
= 0; i
< 8; ++i
)
581 if(xmm_mask
& (1 << i
)) {
585 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
591 x86_make_reg( file_REG32
, reg_SP
),
594 /* Restore GP registers in a reverse order.
598 x86_make_reg( file_REG32
, reg_DX
) );
601 x86_make_reg( file_REG32
, reg_CX
) );
604 x86_make_reg( file_REG32
, reg_AX
) );
609 get_temp( TEMP_R0
, 0 ) );
613 emit_func_call_dst_src(
614 struct x86_function
*func
,
618 void (PIPE_CDECL
*code
)() )
622 get_temp( TEMP_R0
, 1 ),
623 make_xmm( xmm_src
) );
633 #if defined(PIPE_ARCH_SSE)
636 * Fast SSE2 implementation of special math functions.
/*
 * Horner-scheme polynomial evaluation over 4-wide SSE floats:
 * POLYn(x, c0, ..., cn) computes c0 + c1*x + c2*x^2 + ... + cn*x^n.
 * Each degree is built by one multiply-add on top of the next lower degree.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Degrees of the minimax polynomial fits used below; higher degree is
 * more accurate but slower. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
650 * See http://www.devmaster.net/forums/showthread.php?p=43580
656 __m128 fpart
, expipart
, expfpart
;
658 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
659 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
661 /* ipart = int(x - 0.5) */
662 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
664 /* fpart = x - ipart */
665 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
667 /* expipart = (float) (1 << ipart) */
668 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
670 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
671 #if EXP_POLY_DEGREE == 5
672 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
673 #elif EXP_POLY_DEGREE == 4
674 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
675 #elif EXP_POLY_DEGREE == 3
676 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
677 #elif EXP_POLY_DEGREE == 2
678 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
683 return _mm_mul_ps(expipart
, expfpart
);
688 * See http://www.devmaster.net/forums/showthread.php?p=43580
693 __m128i expmask
= _mm_set1_epi32(0x7f800000);
694 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
695 __m128 one
= _mm_set1_ps(1.0f
);
697 __m128i i
= _mm_castps_si128(x
);
699 /* exp = (float) exponent(x) */
700 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
702 /* mant = (float) mantissa(x) */
703 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
707 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
708 * These coefficients can be generate with
709 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
711 #if LOG_POLY_DEGREE == 6
712 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
713 #elif LOG_POLY_DEGREE == 5
714 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
715 #elif LOG_POLY_DEGREE == 4
716 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
717 #elif LOG_POLY_DEGREE == 3
718 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
723 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
724 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
726 return _mm_add_ps(logmant
, exp
);
731 powf4(__m128 x
, __m128 y
)
733 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
736 #endif /* PIPE_ARCH_SSE */
741 * Low-level instruction translators.
746 struct x86_function
*func
,
753 TGSI_EXEC_TEMP_7FFFFFFF_I
,
754 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
759 struct x86_function
*func
,
766 make_xmm( xmm_src
) );
769 static void PIPE_CDECL
773 store
[0] = cosf( store
[0] );
774 store
[1] = cosf( store
[1] );
775 store
[2] = cosf( store
[2] );
776 store
[3] = cosf( store
[3] );
781 struct x86_function
*func
,
792 static void PIPE_CDECL
793 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
794 __attribute__((force_align_arg_pointer
))
799 #if defined(PIPE_ARCH_SSE)
800 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
802 store
[0] = util_fast_exp2( store
[0] );
803 store
[1] = util_fast_exp2( store
[1] );
804 store
[2] = util_fast_exp2( store
[2] );
805 store
[3] = util_fast_exp2( store
[3] );
811 struct x86_function
*func
,
824 struct x86_function
*func
,
835 struct x86_function
*func
,
844 static void PIPE_CDECL
848 store
[0] = floorf( store
[0] );
849 store
[1] = floorf( store
[1] );
850 store
[2] = floorf( store
[2] );
851 store
[3] = floorf( store
[3] );
856 struct x86_function
*func
,
867 static void PIPE_CDECL
871 store
[0] -= floorf( store
[0] );
872 store
[1] -= floorf( store
[1] );
873 store
[2] -= floorf( store
[2] );
874 store
[3] -= floorf( store
[3] );
879 struct x86_function
*func
,
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
892 __attribute__((force_align_arg_pointer
))
897 #if defined(PIPE_ARCH_SSE)
898 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
900 store
[0] = util_fast_log2( store
[0] );
901 store
[1] = util_fast_log2( store
[1] );
902 store
[2] = util_fast_log2( store
[2] );
903 store
[3] = util_fast_log2( store
[3] );
909 struct x86_function
*func
,
922 struct x86_function
*func
,
929 make_xmm( xmm_src
) );
933 emit_mul (struct x86_function
*func
,
940 make_xmm( xmm_src
) );
945 struct x86_function
*func
,
952 TGSI_EXEC_TEMP_80000000_I
,
953 TGSI_EXEC_TEMP_80000000_C
) );
956 static void PIPE_CDECL
957 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
958 __attribute__((force_align_arg_pointer
))
963 #if defined(PIPE_ARCH_SSE)
964 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
966 store
[0] = util_fast_pow( store
[0], store
[4] );
967 store
[1] = util_fast_pow( store
[1], store
[5] );
968 store
[2] = util_fast_pow( store
[2], store
[6] );
969 store
[3] = util_fast_pow( store
[3], store
[7] );
975 struct x86_function
*func
,
980 emit_func_call_dst_src(
990 struct x86_function
*func
,
994 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
995 * good enough. Need to either emit a proper divide or use the
996 * iterative technique described below in emit_rsqrt().
1000 make_xmm( xmm_dst
),
1001 make_xmm( xmm_src
) );
1004 static void PIPE_CDECL
1008 store
[0] = floorf( store
[0] + 0.5f
);
1009 store
[1] = floorf( store
[1] + 0.5f
);
1010 store
[2] = floorf( store
[2] + 0.5f
);
1011 store
[3] = floorf( store
[3] + 0.5f
);
1016 struct x86_function
*func
,
1029 struct x86_function
*func
,
1034 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1035 * implementations, it is possible to improve its precision at
1036 * fairly low cost, using a newton/raphson step, as below:
1038 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1039 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1041 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1044 struct x86_reg dst
= make_xmm( xmm_dst
);
1045 struct x86_reg src
= make_xmm( xmm_src
);
1046 struct x86_reg tmp0
= make_xmm( 2 );
1047 struct x86_reg tmp1
= make_xmm( 3 );
1049 assert( xmm_dst
!= xmm_src
);
1050 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1051 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1053 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1054 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1055 sse_rsqrtps( func
, tmp1
, src
);
1056 sse_mulps( func
, src
, tmp1
);
1057 sse_mulps( func
, dst
, tmp1
);
1058 sse_mulps( func
, src
, tmp1
);
1059 sse_subps( func
, tmp0
, src
);
1060 sse_mulps( func
, dst
, tmp0
);
1063 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1068 make_xmm( xmm_dst
),
1069 make_xmm( xmm_src
) );
1075 struct x86_function
*func
,
1082 TGSI_EXEC_TEMP_80000000_I
,
1083 TGSI_EXEC_TEMP_80000000_C
) );
1086 static void PIPE_CDECL
1090 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1091 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1092 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1093 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1098 struct x86_function
*func
,
1109 static void PIPE_CDECL
1113 store
[0] = sinf( store
[0] );
1114 store
[1] = sinf( store
[1] );
1115 store
[2] = sinf( store
[2] );
1116 store
[3] = sinf( store
[3] );
1120 emit_sin (struct x86_function
*func
,
1133 struct x86_function
*func
,
1139 make_xmm( xmm_dst
),
1140 make_xmm( xmm_src
) );
1149 struct x86_function
*func
,
1151 const struct tgsi_full_src_register
*reg
,
1152 const unsigned chan_index
)
1154 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1157 case TGSI_EXTSWIZZLE_X
:
1158 case TGSI_EXTSWIZZLE_Y
:
1159 case TGSI_EXTSWIZZLE_Z
:
1160 case TGSI_EXTSWIZZLE_W
:
1161 switch (reg
->SrcRegister
.File
) {
1162 case TGSI_FILE_CONSTANT
:
1166 reg
->SrcRegister
.Index
,
1168 reg
->SrcRegister
.Indirect
,
1169 reg
->SrcRegisterInd
.File
,
1170 reg
->SrcRegisterInd
.Index
);
1173 case TGSI_FILE_IMMEDIATE
:
1177 reg
->SrcRegister
.Index
,
1181 case TGSI_FILE_INPUT
:
1185 reg
->SrcRegister
.Index
,
1189 case TGSI_FILE_TEMPORARY
:
1193 reg
->SrcRegister
.Index
,
1202 case TGSI_EXTSWIZZLE_ZERO
:
1206 TGSI_EXEC_TEMP_00000000_I
,
1207 TGSI_EXEC_TEMP_00000000_C
);
1210 case TGSI_EXTSWIZZLE_ONE
:
1222 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1223 case TGSI_UTIL_SIGN_CLEAR
:
1224 emit_abs( func
, xmm
);
1227 case TGSI_UTIL_SIGN_SET
:
1228 emit_setsign( func
, xmm
);
1231 case TGSI_UTIL_SIGN_TOGGLE
:
1232 emit_neg( func
, xmm
);
1235 case TGSI_UTIL_SIGN_KEEP
:
/* Fetch channel CHAN of src register INDEX of instruction INST into xmm XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1249 struct x86_function
*func
,
1251 const struct tgsi_full_dst_register
*reg
,
1252 const struct tgsi_full_instruction
*inst
,
1253 unsigned chan_index
)
1255 switch( reg
->DstRegister
.File
) {
1256 case TGSI_FILE_OUTPUT
:
1260 reg
->DstRegister
.Index
,
1264 case TGSI_FILE_TEMPORARY
:
1268 reg
->DstRegister
.Index
,
1272 case TGSI_FILE_ADDRESS
:
1276 reg
->DstRegister
.Index
,
1284 switch( inst
->Instruction
.Saturate
) {
1288 case TGSI_SAT_ZERO_ONE
:
1292 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Store xmm XMM into channel CHAN of dst register INDEX of instruction INST.
 * INST is also passed through so the store can honor saturation modes. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1302 * High-level instruction translators.
1307 struct x86_function
*func
,
1308 const struct tgsi_full_src_register
*reg
)
1310 unsigned uniquemask
;
1311 unsigned registers
[4];
1312 unsigned nextregister
= 0;
1313 unsigned firstchan
= ~0;
1314 unsigned chan_index
;
1316 /* This mask stores component bits that were already tested. Note that
1317 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1319 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1321 FOR_EACH_CHANNEL( chan_index
) {
1324 /* unswizzle channel */
1325 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1329 /* check if the component has not been already tested */
1330 if( !(uniquemask
& (1 << swizzle
)) ) {
1331 uniquemask
|= 1 << swizzle
;
1333 /* allocate register */
1334 registers
[chan_index
] = nextregister
;
1342 /* mark the first channel used */
1343 if( firstchan
== ~0 ) {
1344 firstchan
= chan_index
;
1351 x86_make_reg( file_REG32
, reg_AX
) );
1354 x86_make_reg( file_REG32
, reg_DX
) );
1356 FOR_EACH_CHANNEL( chan_index
) {
1357 if( uniquemask
& (1 << chan_index
) ) {
1360 make_xmm( registers
[chan_index
] ),
1362 TGSI_EXEC_TEMP_00000000_I
,
1363 TGSI_EXEC_TEMP_00000000_C
),
1366 if( chan_index
== firstchan
) {
1369 x86_make_reg( file_REG32
, reg_AX
),
1370 make_xmm( registers
[chan_index
] ) );
1375 x86_make_reg( file_REG32
, reg_DX
),
1376 make_xmm( registers
[chan_index
] ) );
1379 x86_make_reg( file_REG32
, reg_AX
),
1380 x86_make_reg( file_REG32
, reg_DX
) );
1388 TGSI_EXEC_TEMP_KILMASK_I
,
1389 TGSI_EXEC_TEMP_KILMASK_C
),
1390 x86_make_reg( file_REG32
, reg_AX
) );
1394 x86_make_reg( file_REG32
, reg_DX
) );
1397 x86_make_reg( file_REG32
, reg_AX
) );
1403 struct x86_function
*func
)
1405 /* XXX todo / fix me */
1411 struct x86_function
*func
,
1412 struct tgsi_full_instruction
*inst
,
1415 unsigned chan_index
;
1417 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1418 FETCH( func
, *inst
, 0, 0, chan_index
);
1419 FETCH( func
, *inst
, 1, 1, chan_index
);
1431 STORE( func
, *inst
, 0, 0, chan_index
);
1437 struct x86_function
*func
,
1438 struct tgsi_full_instruction
*inst
)
1440 unsigned chan_index
;
1442 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1443 FETCH( func
, *inst
, 0, 0, chan_index
);
1444 FETCH( func
, *inst
, 1, 1, chan_index
);
1445 FETCH( func
, *inst
, 2, 2, chan_index
);
1450 TGSI_EXEC_TEMP_00000000_I
,
1451 TGSI_EXEC_TEMP_00000000_C
),
1465 STORE( func
, *inst
, 0, 0, chan_index
);
1471 struct x86_function
*func
,
1472 struct tgsi_full_instruction
*inst
)
1474 unsigned chan_index
;
1476 switch (inst
->Instruction
.Opcode
) {
1477 case TGSI_OPCODE_ARL
:
1478 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1479 FETCH( func
, *inst
, 0, 0, chan_index
);
1480 emit_f2it( func
, 0 );
1481 STORE( func
, *inst
, 0, 0, chan_index
);
1485 case TGSI_OPCODE_MOV
:
1486 case TGSI_OPCODE_SWZ
:
1487 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1488 FETCH( func
, *inst
, 0, 0, chan_index
);
1489 STORE( func
, *inst
, 0, 0, chan_index
);
1493 case TGSI_OPCODE_LIT
:
1494 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1495 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1501 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1502 STORE( func
, *inst
, 0, 0, CHAN_X
);
1504 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1505 STORE( func
, *inst
, 0, 0, CHAN_W
);
1508 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1509 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1510 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1511 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1516 TGSI_EXEC_TEMP_00000000_I
,
1517 TGSI_EXEC_TEMP_00000000_C
) );
1518 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1520 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1521 /* XMM[1] = SrcReg[0].yyyy */
1522 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1523 /* XMM[1] = max(XMM[1], 0) */
1528 TGSI_EXEC_TEMP_00000000_I
,
1529 TGSI_EXEC_TEMP_00000000_C
) );
1530 /* XMM[2] = SrcReg[0].wwww */
1531 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1532 /* XMM[2] = min(XMM[2], 128.0) */
1537 TGSI_EXEC_TEMP_128_I
,
1538 TGSI_EXEC_TEMP_128_C
) );
1539 /* XMM[2] = max(XMM[2], -128.0) */
1544 TGSI_EXEC_TEMP_MINUS_128_I
,
1545 TGSI_EXEC_TEMP_MINUS_128_C
) );
1546 emit_pow( func
, 3, 1, 2 );
1547 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1561 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1566 case TGSI_OPCODE_RCP
:
1567 /* TGSI_OPCODE_RECIP */
1568 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1569 emit_rcp( func
, 0, 0 );
1570 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1571 STORE( func
, *inst
, 0, 0, chan_index
);
1575 case TGSI_OPCODE_RSQ
:
1576 /* TGSI_OPCODE_RECIPSQRT */
1577 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1578 emit_abs( func
, 0 );
1579 emit_rsqrt( func
, 1, 0 );
1580 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1581 STORE( func
, *inst
, 1, 0, chan_index
);
1585 case TGSI_OPCODE_EXP
:
1586 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1587 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1588 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1589 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1590 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1591 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1592 emit_MOV( func
, 1, 0 );
1593 emit_flr( func
, 2, 1 );
1594 /* dst.x = ex2(floor(src.x)) */
1595 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1596 emit_MOV( func
, 2, 1 );
1597 emit_ex2( func
, 3, 2 );
1598 STORE( func
, *inst
, 2, 0, CHAN_X
);
1600 /* dst.y = src.x - floor(src.x) */
1601 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1602 emit_MOV( func
, 2, 0 );
1603 emit_sub( func
, 2, 1 );
1604 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1607 /* dst.z = ex2(src.x) */
1608 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1609 emit_ex2( func
, 3, 0 );
1610 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1614 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1615 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1616 STORE( func
, *inst
, 0, 0, CHAN_W
);
1620 case TGSI_OPCODE_LOG
:
1621 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1622 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1623 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1624 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1625 emit_abs( func
, 0 );
1626 emit_MOV( func
, 1, 0 );
1627 emit_lg2( func
, 2, 1 );
1628 /* dst.z = lg2(abs(src.x)) */
1629 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1630 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1632 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1633 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1634 emit_flr( func
, 2, 1 );
1635 /* dst.x = floor(lg2(abs(src.x))) */
1636 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1637 STORE( func
, *inst
, 1, 0, CHAN_X
);
1639 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1640 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1641 emit_ex2( func
, 2, 1 );
1642 emit_rcp( func
, 1, 1 );
1643 emit_mul( func
, 0, 1 );
1644 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1649 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1650 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1651 STORE( func
, *inst
, 0, 0, CHAN_W
);
1655 case TGSI_OPCODE_MUL
:
1656 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1657 FETCH( func
, *inst
, 0, 0, chan_index
);
1658 FETCH( func
, *inst
, 1, 1, chan_index
);
1659 emit_mul( func
, 0, 1 );
1660 STORE( func
, *inst
, 0, 0, chan_index
);
1664 case TGSI_OPCODE_ADD
:
1665 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1666 FETCH( func
, *inst
, 0, 0, chan_index
);
1667 FETCH( func
, *inst
, 1, 1, chan_index
);
1668 emit_add( func
, 0, 1 );
1669 STORE( func
, *inst
, 0, 0, chan_index
);
1673 case TGSI_OPCODE_DP3
:
1674 /* TGSI_OPCODE_DOT3 */
1675 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1676 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1677 emit_mul( func
, 0, 1 );
1678 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1679 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1680 emit_mul( func
, 1, 2 );
1681 emit_add( func
, 0, 1 );
1682 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1683 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1684 emit_mul( func
, 1, 2 );
1685 emit_add( func
, 0, 1 );
1686 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1687 STORE( func
, *inst
, 0, 0, chan_index
);
1691 case TGSI_OPCODE_DP4
:
1692 /* TGSI_OPCODE_DOT4 */
1693 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1694 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1695 emit_mul( func
, 0, 1 );
1696 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1697 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1698 emit_mul( func
, 1, 2 );
1699 emit_add( func
, 0, 1 );
1700 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1701 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1702 emit_mul(func
, 1, 2 );
1703 emit_add(func
, 0, 1 );
1704 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1705 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1706 emit_mul( func
, 1, 2 );
1707 emit_add( func
, 0, 1 );
1708 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1709 STORE( func
, *inst
, 0, 0, chan_index
);
1713 case TGSI_OPCODE_DST
:
1714 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1720 STORE( func
, *inst
, 0, 0, CHAN_X
);
1722 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1723 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1724 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1725 emit_mul( func
, 0, 1 );
1726 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1728 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1729 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1730 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1732 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1733 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1734 STORE( func
, *inst
, 0, 0, CHAN_W
);
1738 case TGSI_OPCODE_MIN
:
1739 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1740 FETCH( func
, *inst
, 0, 0, chan_index
);
1741 FETCH( func
, *inst
, 1, 1, chan_index
);
1746 STORE( func
, *inst
, 0, 0, chan_index
);
1750 case TGSI_OPCODE_MAX
:
1751 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1752 FETCH( func
, *inst
, 0, 0, chan_index
);
1753 FETCH( func
, *inst
, 1, 1, chan_index
);
1758 STORE( func
, *inst
, 0, 0, chan_index
);
1762 case TGSI_OPCODE_SLT
:
1763 /* TGSI_OPCODE_SETLT */
1764 emit_setcc( func
, inst
, cc_LessThan
);
1767 case TGSI_OPCODE_SGE
:
1768 /* TGSI_OPCODE_SETGE */
1769 emit_setcc( func
, inst
, cc_NotLessThan
);
1772 case TGSI_OPCODE_MAD
:
1773 /* TGSI_OPCODE_MADD */
1774 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1775 FETCH( func
, *inst
, 0, 0, chan_index
);
1776 FETCH( func
, *inst
, 1, 1, chan_index
);
1777 FETCH( func
, *inst
, 2, 2, chan_index
);
1778 emit_mul( func
, 0, 1 );
1779 emit_add( func
, 0, 2 );
1780 STORE( func
, *inst
, 0, 0, chan_index
);
1784 case TGSI_OPCODE_SUB
:
1785 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1786 FETCH( func
, *inst
, 0, 0, chan_index
);
1787 FETCH( func
, *inst
, 1, 1, chan_index
);
1788 emit_sub( func
, 0, 1 );
1789 STORE( func
, *inst
, 0, 0, chan_index
);
1793 case TGSI_OPCODE_LERP
:
1794 /* TGSI_OPCODE_LRP */
1795 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1796 FETCH( func
, *inst
, 0, 0, chan_index
);
1797 FETCH( func
, *inst
, 1, 1, chan_index
);
1798 FETCH( func
, *inst
, 2, 2, chan_index
);
1799 emit_sub( func
, 1, 2 );
1800 emit_mul( func
, 0, 1 );
1801 emit_add( func
, 0, 2 );
1802 STORE( func
, *inst
, 0, 0, chan_index
);
1806 case TGSI_OPCODE_CND
:
1810 case TGSI_OPCODE_CND0
:
1814 case TGSI_OPCODE_DOT2ADD
:
1815 /* TGSI_OPCODE_DP2A */
1816 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
1817 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
1818 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1819 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
1820 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
1821 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1822 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1823 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
1824 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1825 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1826 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
case TGSI_OPCODE_INDEX:
case TGSI_OPCODE_NEGATE:
case TGSI_OPCODE_FRAC:
   /* TGSI_OPCODE_FRC: dst.ch = fractional part of src0.ch */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_frc( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_CLAMP:
case TGSI_OPCODE_FLOOR:
   /* TGSI_OPCODE_FLR: dst.ch = floor(src0.ch) */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_flr( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_ROUND:
   /* dst.ch = src0.ch rounded to an integer value (see emit_rnd) */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_rnd( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_EXPBASE2:
   /* TGSI_OPCODE_EX2: scalar 2^(src0.x), replicated to all
    * enabled destination channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_ex2( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_LOGBASE2:
   /* TGSI_OPCODE_LG2: scalar log2(src0.x), replicated to all
    * enabled destination channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_lg2( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_POWER:
   /* TGSI_OPCODE_POW: scalar src0.x ^ src1.x, replicated to all
    * enabled destination channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   FETCH( func, *inst, 1, 1, CHAN_X );
   emit_pow( func, 0, 0, 1 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_CROSSPRODUCT:
   /* TGSI_OPCODE_XPD: dst.xyz = src0 cross src1 (dst.w handled last).
    * Register plan after the conditional fetches below:
    *   xmm0 = src0.y   xmm1 = src1.z   xmm3 = src0.z   xmm4 = src1.y
    *   xmm2 = src1.x   xmm5 = src0.x   (xmm2/xmm5 are scratch for the
    *   X result first, then reloaded with the .x components) */
   if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
       IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
      FETCH( func, *inst, 1, 1, CHAN_Z );   /* xmm1 = src[1].z */
      FETCH( func, *inst, 3, 0, CHAN_Z );   /* xmm3 = src[0].z */
   if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
       IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
      FETCH( func, *inst, 0, 0, CHAN_Y );   /* xmm0 = src[0].y */
      FETCH( func, *inst, 4, 1, CHAN_Y );   /* xmm4 = src[1].y */
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
      /* dst.x = src0.y * src1.z - src0.z * src1.y */
      emit_MOV( func, 2, 0 );    /* xmm2 = src0.y */
      emit_mul( func, 2, 1 );    /* xmm2 *= src1.z */
      emit_MOV( func, 5, 3 );    /* xmm5 = src0.z */
      emit_mul( func, 5, 4 );    /* xmm5 *= src1.y */
      emit_sub( func, 2, 5 );    /* xmm2 -= xmm5 */
      STORE( func, *inst, 2, 0, CHAN_X );
   if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
       IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
      FETCH( func, *inst, 2, 1, CHAN_X );   /* xmm2 = src[1].x */
      FETCH( func, *inst, 5, 0, CHAN_X );   /* xmm5 = src[0].x */
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
      /* dst.y = src0.z * src1.x - src0.x * src1.z */
      emit_mul( func, 3, 2 );    /* xmm3 = src0.z * src1.x */
      emit_mul( func, 1, 5 );    /* xmm1 = src1.z * src0.x */
      emit_sub( func, 3, 1 );    /* xmm3 -= xmm1 */
      STORE( func, *inst, 3, 0, CHAN_Y );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
      /* dst.z = src0.x * src1.y - src0.y * src1.x */
      emit_mul( func, 5, 4 );    /* xmm5 = src0.x * src1.y */
      emit_mul( func, 0, 2 );    /* xmm0 = src0.y * src1.x */
      emit_sub( func, 5, 0 );    /* xmm5 -= xmm0 */
      STORE( func, *inst, 5, 0, CHAN_Z );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
      /* NOTE(review): presumably xmm0 is loaded with 1.0 just before
       * this store (the load is elided in this view) — confirm against
       * the full file. */
      STORE( func, *inst, 0, 0, CHAN_W );
case TGSI_OPCODE_MULTIPLYMATRIX:
case TGSI_OPCODE_ABS:
   /* dst.ch = |src0.ch| */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_abs( func, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_RCC:
case TGSI_OPCODE_DPH:
   /* homogeneous dot product:
    * dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z + src1.w
    * (src0.w is implicitly 1.0), replicated to enabled channels */
   FETCH( func, *inst, 0, 0, CHAN_X );   /* xmm0 = src[0].x */
   FETCH( func, *inst, 1, 1, CHAN_X );   /* xmm1 = src[1].x */
   emit_mul( func, 0, 1 );               /* xmm0 = x*x' */
   FETCH( func, *inst, 1, 0, CHAN_Y );   /* xmm1 = src[0].y */
   FETCH( func, *inst, 2, 1, CHAN_Y );   /* xmm2 = src[1].y */
   emit_mul( func, 1, 2 );               /* xmm1 = y*y' */
   emit_add( func, 0, 1 );               /* xmm0 += xmm1 */
   FETCH( func, *inst, 1, 0, CHAN_Z );   /* xmm1 = src[0].z */
   FETCH( func, *inst, 2, 1, CHAN_Z );   /* xmm2 = src[1].z */
   emit_mul( func, 1, 2 );               /* xmm1 = z*z' */
   emit_add( func, 0, 1 );               /* xmm0 += xmm1 */
   FETCH( func, *inst, 1, 1, CHAN_W );   /* xmm1 = src[1].w */
   emit_add( func, 0, 1 );               /* xmm0 += src1.w */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_COS:
   /* scalar cos(src0.x), replicated to enabled channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_cos( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_DDX:
case TGSI_OPCODE_DDY:
case TGSI_OPCODE_KILP:
   /* predicated kill */
   return 0; /* XXX fix me */
case TGSI_OPCODE_KIL:
   /* conditional kill: discard fragment based on src[0] */
   emit_kil( func, &inst->FullSrcRegisters[0] );
case TGSI_OPCODE_PK2H:
case TGSI_OPCODE_PK2US:
case TGSI_OPCODE_PK4B:
case TGSI_OPCODE_PK4UB:
case TGSI_OPCODE_RFL:
case TGSI_OPCODE_SEQ:
case TGSI_OPCODE_SFL:
case TGSI_OPCODE_SGT:
case TGSI_OPCODE_SIN:
   /* scalar sin(src0.x), replicated to enabled channels */
   FETCH( func, *inst, 0, 0, CHAN_X );
   emit_sin( func, 0, 0 );
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_SLE:
case TGSI_OPCODE_SNE:
case TGSI_OPCODE_STR:
case TGSI_OPCODE_TEX:
   /* Disable dummy texture code (the dummy sampling statements are
    * elided in this view): */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_TXD:
case TGSI_OPCODE_UP2H:
case TGSI_OPCODE_UP2US:
case TGSI_OPCODE_UP4B:
case TGSI_OPCODE_UP4UB:
case TGSI_OPCODE_X2D:
case TGSI_OPCODE_ARA:
case TGSI_OPCODE_ARR:
   /* address-register load with rounding: round src0.ch to the nearest
    * integer, then convert float -> int before storing */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_rnd( func, 0, 0 );
      emit_f2it( func, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_BRA:
case TGSI_OPCODE_CAL:
case TGSI_OPCODE_RET:
case TGSI_OPCODE_END:
case TGSI_OPCODE_SSG:
   /* TGSI_OPCODE_SGN: dst.ch = sign of src0.ch (see emit_sgn) */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_sgn( func, 0, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_CMP:
   /* whole-instruction helper handles all channels */
   emit_cmp( func, inst );
case TGSI_OPCODE_SCS:
   /* sine/cosine: dst.x = cos(src0.x), dst.y = sin(src0.x),
    * dst.z = 0.0, dst.w = 1.0 */
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_cos( func, 0, 0 );
      STORE( func, *inst, 0, 0, CHAN_X );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_sin( func, 0, 0 );
      STORE( func, *inst, 0, 0, CHAN_Y );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
      /* tail of an emit_tempf( func, 0, ... ) call loading the 0.0
       * constant; the opening of the call is elided in this view */
      TGSI_EXEC_TEMP_00000000_I,
      TGSI_EXEC_TEMP_00000000_C );
      STORE( func, *inst, 0, 0, CHAN_Z );
   IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
      /* NOTE(review): presumably xmm0 is loaded with 1.0 just before
       * this store (load elided in this view) — confirm */
      STORE( func, *inst, 0, 0, CHAN_W );
case TGSI_OPCODE_TXB:
2174 case TGSI_OPCODE_NRM
:
2176 case TGSI_OPCODE_NRM4
:
2177 /* 3 or 4-component normalization */
2179 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2180 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2181 FETCH( func
, *inst
, 4, 0, CHAN_X
); /* xmm4 = src[0].x */
2182 FETCH( func
, *inst
, 5, 0, CHAN_Y
); /* xmm5 = src[0].y */
2183 FETCH( func
, *inst
, 6, 0, CHAN_Z
); /* xmm6 = src[0].z */
2185 FETCH( func
, *inst
, 7, 0, CHAN_W
); /* xmm7 = src[0].w */
2187 emit_MOV( func
, 0, 4 ); /* xmm0 = xmm3 */
2188 emit_mul( func
, 0, 4 ); /* xmm0 *= xmm3 */
2189 emit_MOV( func
, 1, 5 ); /* xmm1 = xmm4 */
2190 emit_mul( func
, 1, 5 ); /* xmm1 *= xmm4 */
2191 emit_add( func
, 0, 1 ); /* xmm0 += xmm1 */
2192 emit_MOV( func
, 1, 6 ); /* xmm1 = xmm5 */
2193 emit_mul( func
, 1, 6 ); /* xmm1 *= xmm5 */
2194 emit_add( func
, 0, 1 ); /* xmm0 += xmm1 */
2196 emit_MOV( func
, 1, 7 ); /* xmm1 = xmm7 */
2197 emit_mul( func
, 1, 7 ); /* xmm1 *= xmm7 */
2198 emit_add( func
, 0, 0 ); /* xmm0 += xmm1 */
2200 emit_rsqrt( func
, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2201 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2202 if (chan_index
< dims
) {
2203 emit_mul( func
, 4+chan_index
, 1); /* xmm[4+ch] *= xmm1 */
2204 STORE( func
, *inst
, 4+chan_index
, 0, chan_index
);
case TGSI_OPCODE_DIV:
case TGSI_OPCODE_DP2:
   /* 2-component dot product: dst = src0.x*src1.x + src0.y*src1.y,
    * the scalar result replicated to every enabled channel */
   FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
   FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
   emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
   FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
   FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
   emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
   emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
case TGSI_OPCODE_TXL:
case TGSI_OPCODE_BRK:
case TGSI_OPCODE_IF:
case TGSI_OPCODE_LOOP:
case TGSI_OPCODE_REP:
case TGSI_OPCODE_ELSE:
case TGSI_OPCODE_ENDIF:
case TGSI_OPCODE_ENDLOOP:
case TGSI_OPCODE_ENDREP:
case TGSI_OPCODE_PUSHA:
case TGSI_OPCODE_POPA:
case TGSI_OPCODE_CEIL:
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_NOT:
case TGSI_OPCODE_TRUNC:
   /* dst.ch = trunc(src0.ch): the float -> int -> float round-trip
    * truncates toward zero */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      emit_f2it( func, 0 );
      emit_i2f( func, 0 );
      STORE( func, *inst, 0, 0, chan_index );
case TGSI_OPCODE_SHL:
case TGSI_OPCODE_SHR:
case TGSI_OPCODE_AND:
case TGSI_OPCODE_OR:
case TGSI_OPCODE_MOD:
case TGSI_OPCODE_XOR:
case TGSI_OPCODE_SAD:
case TGSI_OPCODE_TXF:
case TGSI_OPCODE_TXQ:
case TGSI_OPCODE_CONT:
case TGSI_OPCODE_EMIT:
case TGSI_OPCODE_ENDPRIM:
struct x86_function *func,
struct tgsi_full_declaration *decl )
/* Fragment of emit_declaration(): emits per-fragment interpolation code
 * for one fragment-shader INPUT declaration.  For each declared register
 * i and each channel j enabled in the usage mask, computes the
 * interpolated input value into xmm0 and stores it via emit_inputs().
 * (The function's opening line and several scope braces are elided in
 * this view.) */
if( decl->Declaration.File == TGSI_FILE_INPUT ) {
   unsigned first, last, mask;

   first = decl->DeclarationRange.First;
   last = decl->DeclarationRange.Last;
   mask = decl->Declaration.UsageMask;

   for( i = first; i <= last; i++ ) {
      for( j = 0; j < NUM_CHANNELS; j++ ) {
         if( mask & (1 << j) ) {
            switch( decl->Declaration.Interpolate ) {
            case TGSI_INTERPOLATE_CONSTANT:
               /* flat: input = a0 coefficient only */
               emit_coef_a0( func, 0, i, j );
               emit_inputs( func, 0, i, j );
            case TGSI_INTERPOLATE_LINEAR:
               /* input = a0 + x*dadx + y*dady */
               emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
               emit_coef_dadx( func, 1, i, j );
               emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
               emit_coef_dady( func, 3, i, j );
               emit_mul( func, 0, 1 );    /* x * dadx */
               emit_coef_a0( func, 4, i, j );
               emit_mul( func, 2, 3 );    /* y * dady */
               emit_add( func, 0, 4 );    /* x * dadx + a0 */
               emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
               emit_inputs( func, 0, i, j );
            case TGSI_INTERPOLATE_PERSPECTIVE:
               /* input = (a0 + x*dadx + y*dady) / w */
               emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
               emit_coef_dadx( func, 1, i, j );
               emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
               emit_coef_dady( func, 3, i, j );
               emit_mul( func, 0, 1 );    /* x * dadx */
               emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
               emit_coef_a0( func, 5, i, j );
               emit_rcp( func, 4, 4 );    /* 1.0 / w */
               emit_mul( func, 2, 3 );    /* y * dady */
               emit_add( func, 0, 5 );    /* x * dadx + a0 */
               emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
               emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
               emit_inputs( func, 0, i, j );
static void aos_to_soa( struct x86_function *func,
/* Emit x86/SSE code that converts vertex attributes from AoS layout
 * (attributes interleaved per vertex, `arg_stride` bytes between
 * vertices) to SoA layout (x[4], y[4], z[4], w[4] per attribute).
 * Each loop iteration gathers one 4-float attribute from four vertices
 * and transposes the 4x4 float matrix with movlps/movhps + shufps.
 * (Remaining parameter declarations and scope braces are elided in this
 * view; arg_* are x86_fn_arg() argument indexes.) */
struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );

/* EBX is callee-saved in the x86 ABI: preserve it around the loop */
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

inner_loop = x86_get_label( func );

/* gather the current attribute from 4 vertices:
 *   xmm0 = v0.xy | v1.xy     xmm3 = v0.zw | v1.zw
 *   xmm1 = v2.xy | v3.xy     xmm4 = v2.zw | v3.zw
 * (aos_input is saved/restored so the stride walk is temporary) */
x86_push( func, aos_input );
sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
x86_add( func, aos_input, stride );
sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
x86_add( func, aos_input, stride );
sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
x86_add( func, aos_input, stride );
sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
x86_pop( func, aos_input );

/* transpose: shufps 0x88 picks even lanes (x or z of each vertex),
 * 0xdd picks odd lanes (y or w) */
sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );   /* x[4] */
sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );   /* y[4] */
sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );   /* z[4] */
sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );   /* w[4] */

/* store the SoA vectors contiguously (4 * 16 bytes per attribute) */
sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

/* Advance to next input */
x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );

/* while --num_inputs */
x86_dec( func, num_inputs );
x86_jcc( func, cc_NE, inner_loop );

/* restore callee-saved EBX (pushed above as aos_input's register) */
x86_pop( func, aos_input );
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
/* Emit x86/SSE code performing the inverse of aos_to_soa(): convert
 * `num` attributes from SoA layout (x[4], y[4], z[4], w[4]) back to AoS
 * layout with `stride` bytes between vertices.  The 4x4 transpose is
 * done with unpcklps/unpckhps.  aos/soa/num/stride are x86_fn_arg()
 * argument indexes, not values.
 * (Scope braces and the inner_loop declaration are elided in this
 * view.) */
struct x86_reg soa_output;
struct x86_reg aos_output;
struct x86_reg num_outputs;
struct x86_reg temp;

soa_output = x86_make_reg( file_REG32, reg_AX );
aos_output = x86_make_reg( file_REG32, reg_BX );
num_outputs = x86_make_reg( file_REG32, reg_CX );
temp = x86_make_reg( file_REG32, reg_DX );

/* EBX is callee-saved: preserve it around the loop */
x86_push( func, aos_output );

x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

inner_loop = x86_get_label( func );

/* load one attribute's SoA vectors: xmm0 = x[4], xmm1 = y[4],
 * xmm3 = z[4], xmm4 = w[4] */
sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

/* transpose via interleave:
 *   xmm0 = x0 y0 x1 y1   xmm2 = x2 y2 x3 y3
 *   xmm3 = z0 w0 z1 w1   xmm5 = z2 w2 z3 w3 */
sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

/* scatter one vertex's xyzw per step, walking by the vertex stride
 * (aos_output is saved/restored so the walk is temporary) */
x86_mov( func, temp, x86_fn_arg( func, stride ) );
x86_push( func, aos_output );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );   /* v0.xy */
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );   /* v0.zw */
x86_add( func, aos_output, temp );
sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );   /* v1.xy */
sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );   /* v1.zw */
x86_add( func, aos_output, temp );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );   /* v2.xy */
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );   /* v2.zw */
x86_add( func, aos_output, temp );
sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );   /* v3.xy */
sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );   /* v3.zw */
x86_pop( func, aos_output );

/* Advance to next output */
x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );

/* while --num_outputs */
x86_dec( func, num_outputs );
x86_jcc( func, cc_NE, inner_loop );

/* restore callee-saved EBX */
x86_pop( func, aos_output );
2533 * Translate a TGSI vertex/fragment shader to SSE2 code.
2534 * Slightly different things are done for vertex vs. fragment shaders.
2536 * Note that fragment shaders are responsible for interpolating shader
2537 * inputs. Because on x86 we have only 4 GP registers, and here we
2538 * have 5 shader arguments (input, output, const, temp and coef), the
2539 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2540 * GP register holding the output argument is aliased with the coeff
2541 * argument, as outputs are not needed in the DECLARATION phase.
2543 * \param tokens the TGSI input shader
2544 * \param func the output SSE code/function
2545 * \param immediates buffer to place immediates, later passed to SSE func
2546 * \return 1 for success, 0 if translation failed
const struct tgsi_token *tokens,
struct x86_function *func,
float (*immediates )[4],
boolean do_swizzles )
/* Fragment of tgsi_emit_sse2(): walks the parsed TGSI token stream and
 * emits SSE code per token.  (The function's opening, several x86_mov
 * openers for base-register setup, and most closing braces are elided
 * in this view — the dangling "x86_fn_arg( func, N ) );" lines below
 * are the tails of those elided x86_mov() calls.) */
struct tgsi_parse_context parse;
boolean instruction_phase = FALSE;      /* fragment shaders: FALSE until first instruction */

uint num_immediates = 0;

/* start emitting at the beginning of the code buffer */
func->csr = func->store;

tgsi_parse_init( &parse, tokens );

/* Can't just use EDI, EBX without save/restoring them: */
get_immediate_base() );

/* Different function args for vertex/fragment shaders: */
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
   /* DECLARATION phase, do not load output argument. */
   x86_fn_arg( func, 1 ) );
   /* skipping outputs argument here */
   x86_fn_arg( func, 3 ) );
   x86_fn_arg( func, 4 ) );
   x86_fn_arg( func, 5 ) );
   get_immediate_base(),
   x86_fn_arg( func, 6 ) );
assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
/* vertex shader: de-interleave the input vertices first
 * (arguments below are x86_fn_arg indexes, not values) */
1, /* machine->input */
8 ); /* input_stride */
x86_fn_arg( func, 1 ) );
x86_fn_arg( func, 2 ) );
x86_fn_arg( func, 3 ) );
x86_fn_arg( func, 4 ) );
get_immediate_base(),
x86_fn_arg( func, 5 ) );

/* main translation loop: one TGSI token at a time, stopping early if
 * an instruction fails to translate */
while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
   tgsi_parse_token( &parse );

   switch( parse.FullToken.Token.Type ) {
   case TGSI_TOKEN_TYPE_DECLARATION:
      /* only fragment shaders emit code for declarations (input
       * interpolation); vertex shader declarations are no-ops here */
      if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
         &parse.FullToken.FullDeclaration );
   case TGSI_TOKEN_TYPE_INSTRUCTION:
      if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
         if( !instruction_phase ) {
            /* INSTRUCTION phase, overwrite coeff with output. */
            instruction_phase = TRUE;
            x86_fn_arg( func, 2 ) );
      ok = emit_instruction(
         &parse.FullToken.FullInstruction );
      /* on failure, report which opcode could not be translated */
      debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                   parse.FullToken.FullInstruction.Instruction.Opcode,
                   parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                   "vertex shader" : "fragment shader");
   case TGSI_TOKEN_TYPE_IMMEDIATE:
      /* simply copy the immediate values into the next immediates[] slot */
      const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
      assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
      for( i = 0; i < size; i++ ) {
         immediates[num_immediates][i] =
            parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
      debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                   immediates[num_immediates][0],
                   immediates[num_immediates][1],
                   immediates[num_immediates][2],
                   immediates[num_immediates][3]);
/* vertex shaders: re-interleave the SoA results back to AoS
 * (9/2/10/11 are x86_fn_arg indexes for aos/soa/num/stride) */
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
   soa_to_aos( func, 9, 2, 10, 11 );
/* Can't just use EBX, EDI without save/restoring them: */
get_immediate_base() );
tgsi_parse_free( &parse );