[mesa.git] src/gallium/auxiliary/tgsi/tgsi_sse2.c
(commit: cell: Moved X86 checks to wrap #include section so that Cell targets will compile...)
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #ifdef PIPE_ARCH_X86
29
30 #include "pipe/p_debug.h"
31 #include "pipe/p_shader_tokens.h"
32 #include "util/u_math.h"
33 #include "util/u_sse.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_util.h"
36 #include "tgsi_exec.h"
37 #include "tgsi_sse2.h"
38
39 #include "rtasm/rtasm_x86sse.h"
40
   41 /* Use a higher-precision 1/sqrt() (see emit_rsqrt()).
   42  *
   43  * This costs about 100 fps (close to 10%) in gears:
44 */
45 #define HIGH_PRECISION 1
46
47 #define FAST_MATH 1
48
49
50 #define FOR_EACH_CHANNEL( CHAN )\
51 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
52
53 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
54 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
55
56 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
57 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
58
59 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
60 FOR_EACH_CHANNEL( CHAN )\
61 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
62
63 #define CHAN_X 0
64 #define CHAN_Y 1
65 #define CHAN_Z 2
66 #define CHAN_W 3
67
68 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
69 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
70
71 #define TEMP_R0 TGSI_EXEC_TEMP_R0
72 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
73
74 /**
75 * X86 utility functions.
76 */
77
78 static struct x86_reg
79 make_xmm(
80 unsigned xmm )
81 {
82 return x86_make_reg(
83 file_XMM,
84 (enum x86_reg_name) xmm );
85 }
86
87 /**
88 * X86 register mapping helpers.
89 */
90
91 static struct x86_reg
92 get_const_base( void )
93 {
94 return x86_make_reg(
95 file_REG32,
96 reg_CX );
97 }
98
99 static struct x86_reg
100 get_input_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_AX );
105 }
106
107 static struct x86_reg
108 get_output_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_DX );
113 }
114
115 static struct x86_reg
116 get_temp_base( void )
117 {
118 return x86_make_reg(
119 file_REG32,
120 reg_BX );
121 }
122
123 static struct x86_reg
124 get_coef_base( void )
125 {
126 return get_output_base();
127 }
128
129 static struct x86_reg
130 get_immediate_base( void )
131 {
132 return x86_make_reg(
133 file_REG32,
134 reg_DI );
135 }
136
137
138 /**
139 * Data access helpers.
140 */
141
142
143 static struct x86_reg
144 get_immediate(
145 unsigned vec,
146 unsigned chan )
147 {
148 return x86_make_disp(
149 get_immediate_base(),
150 (vec * 4 + chan) * 4 );
151 }
152
153 static struct x86_reg
154 get_const(
155 unsigned vec,
156 unsigned chan )
157 {
158 return x86_make_disp(
159 get_const_base(),
160 (vec * 4 + chan) * 4 );
161 }
162
163 static struct x86_reg
164 get_input(
165 unsigned vec,
166 unsigned chan )
167 {
168 return x86_make_disp(
169 get_input_base(),
170 (vec * 4 + chan) * 16 );
171 }
172
173 static struct x86_reg
174 get_output(
175 unsigned vec,
176 unsigned chan )
177 {
178 return x86_make_disp(
179 get_output_base(),
180 (vec * 4 + chan) * 16 );
181 }
182
183 static struct x86_reg
184 get_temp(
185 unsigned vec,
186 unsigned chan )
187 {
188 return x86_make_disp(
189 get_temp_base(),
190 (vec * 4 + chan) * 16 );
191 }
192
193 static struct x86_reg
194 get_coef(
195 unsigned vec,
196 unsigned chan,
197 unsigned member )
198 {
199 return x86_make_disp(
200 get_coef_base(),
201 ((vec * 3 + member) * 4 + chan) * 4 );
202 }
203
204
205 static void
206 emit_ret(
207 struct x86_function *func )
208 {
209 x86_ret( func );
210 }
211
212
213 /**
214 * Data fetch helpers.
215 */
216
217 /**
218 * Copy a shader constant to xmm register
219 * \param xmm the destination xmm register
220 * \param vec the src const buffer index
221 * \param chan src channel to fetch (X, Y, Z or W)
222 */
223 static void
224 emit_const(
225 struct x86_function *func,
226 uint xmm,
227 int vec,
228 uint chan,
229 uint indirect,
230 uint indirectFile,
231 int indirectIndex )
232 {
233 if (indirect) {
234 struct x86_reg r0 = get_input_base();
235 struct x86_reg r1 = get_output_base();
236 uint i;
237
238 assert( indirectFile == TGSI_FILE_ADDRESS );
239 assert( indirectIndex == 0 );
240
241 x86_push( func, r0 );
242 x86_push( func, r1 );
243
244 for (i = 0; i < QUAD_SIZE; i++) {
245 x86_lea( func, r0, get_const( vec, chan ) );
246 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
247
248 /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
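          * The value fetched from TEMP_ADDR is an element index; constants
          * are vec4s of floats (16 bytes each), so it is scaled by 16 to
          * turn it into a byte offset.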
249 */
250 x86_add( func, r1, r1 );
251 x86_add( func, r1, r1 );
252 x86_add( func, r1, r1 );
253 x86_add( func, r1, r1 );
254
255 x86_add( func, r0, r1 );
256 x86_mov( func, r1, x86_deref( r0 ) );
257 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
258 }
259
260 x86_pop( func, r1 );
261 x86_pop( func, r0 );
262
263 sse_movaps(
264 func,
265 make_xmm( xmm ),
266 get_temp( TEMP_R0, CHAN_X ) );
267 }
268 else {
269 assert( vec >= 0 );
270
271 sse_movss(
272 func,
273 make_xmm( xmm ),
274 get_const( vec, chan ) );
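      /* Broadcast the scalar just loaded across all four lanes. */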
275 sse_shufps(
276 func,
277 make_xmm( xmm ),
278 make_xmm( xmm ),
279 SHUF( 0, 0, 0, 0 ) );
280 }
281 }
282
283 static void
284 emit_immediate(
285 struct x86_function *func,
286 unsigned xmm,
287 unsigned vec,
288 unsigned chan )
289 {
290 sse_movss(
291 func,
292 make_xmm( xmm ),
293 get_immediate( vec, chan ) );
294 sse_shufps(
295 func,
296 make_xmm( xmm ),
297 make_xmm( xmm ),
298 SHUF( 0, 0, 0, 0 ) );
299 }
300
301
302 /**
303 * Copy a shader input to xmm register
304 * \param xmm the destination xmm register
305 * \param vec the src input attrib
306 * \param chan src channel to fetch (X, Y, Z or W)
307 */
308 static void
309 emit_inputf(
310 struct x86_function *func,
311 unsigned xmm,
312 unsigned vec,
313 unsigned chan )
314 {
315 sse_movups(
316 func,
317 make_xmm( xmm ),
318 get_input( vec, chan ) );
319 }
320
321 /**
322 * Store an xmm register to a shader output
323 * \param xmm the source xmm register
324 * \param vec the dest output attrib
  326  * \param chan the dest channel to store (X, Y, Z or W)
326 */
327 static void
328 emit_output(
329 struct x86_function *func,
330 unsigned xmm,
331 unsigned vec,
332 unsigned chan )
333 {
334 sse_movups(
335 func,
336 get_output( vec, chan ),
337 make_xmm( xmm ) );
338 }
339
340 /**
341 * Copy a shader temporary to xmm register
342 * \param xmm the destination xmm register
343 * \param vec the src temp register
344 * \param chan src channel to fetch (X, Y, Z or W)
345 */
346 static void
347 emit_tempf(
348 struct x86_function *func,
349 unsigned xmm,
350 unsigned vec,
351 unsigned chan )
352 {
353 sse_movaps(
354 func,
355 make_xmm( xmm ),
356 get_temp( vec, chan ) );
357 }
358
359 /**
360 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
361 * \param xmm the destination xmm register
362 * \param vec the src input/attribute coefficient index
363 * \param chan src channel to fetch (X, Y, Z or W)
364 * \param member 0=a0, 1=dadx, 2=dady
365 */
366 static void
367 emit_coef(
368 struct x86_function *func,
369 unsigned xmm,
370 unsigned vec,
371 unsigned chan,
372 unsigned member )
373 {
374 sse_movss(
375 func,
376 make_xmm( xmm ),
377 get_coef( vec, chan, member ) );
378 sse_shufps(
379 func,
380 make_xmm( xmm ),
381 make_xmm( xmm ),
382 SHUF( 0, 0, 0, 0 ) );
383 }
384
385 /**
386 * Data store helpers.
387 */
388
389 static void
390 emit_inputs(
391 struct x86_function *func,
392 unsigned xmm,
393 unsigned vec,
394 unsigned chan )
395 {
396 sse_movups(
397 func,
398 get_input( vec, chan ),
399 make_xmm( xmm ) );
400 }
401
402 static void
403 emit_temps(
404 struct x86_function *func,
405 unsigned xmm,
406 unsigned vec,
407 unsigned chan )
408 {
409 sse_movaps(
410 func,
411 get_temp( vec, chan ),
412 make_xmm( xmm ) );
413 }
414
415 static void
416 emit_addrs(
417 struct x86_function *func,
418 unsigned xmm,
419 unsigned vec,
420 unsigned chan )
421 {
422 assert( vec == 0 );
423
424 emit_temps(
425 func,
426 xmm,
427 vec + TGSI_EXEC_TEMP_ADDR,
428 chan );
429 }
430
431 /**
  432  * Coefficient fetch helpers.
433 */
434
435 static void
436 emit_coef_a0(
437 struct x86_function *func,
438 unsigned xmm,
439 unsigned vec,
440 unsigned chan )
441 {
442 emit_coef(
443 func,
444 xmm,
445 vec,
446 chan,
447 0 );
448 }
449
450 static void
451 emit_coef_dadx(
452 struct x86_function *func,
453 unsigned xmm,
454 unsigned vec,
455 unsigned chan )
456 {
457 emit_coef(
458 func,
459 xmm,
460 vec,
461 chan,
462 1 );
463 }
464
465 static void
466 emit_coef_dady(
467 struct x86_function *func,
468 unsigned xmm,
469 unsigned vec,
470 unsigned chan )
471 {
472 emit_coef(
473 func,
474 xmm,
475 vec,
476 chan,
477 2 );
478 }
479
480 /**
481 * Function call helpers.
482 */
483
484 /**
  485  * NOTE: With gcc, if the called function uses SSE intrinsics, it must be
  486  * defined with __attribute__((force_align_arg_pointer)), as we do not
  487  * guarantee that the stack pointer is 16-byte aligned, as expected.
488 */
489 static void
490 emit_func_call_dst(
491 struct x86_function *func,
492 unsigned xmm_save,
493 unsigned xmm_dst,
494 void (PIPE_CDECL *code)() )
495 {
496 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
  497    unsigned i, n;
498 unsigned xmm_mask;
499
500 /* Bitmask of the xmm registers to save */
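   /* xmm_save is the number of live xmm registers; every register below it
    * except xmm_dst (which is passed to the callee through memory in
    * TEMP_R0) must be preserved across the call.
    */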
501 xmm_mask = (1 << xmm_save) - 1;
502 xmm_mask &= ~(1 << xmm_dst);
503
504 sse_movaps(
505 func,
506 get_temp( TEMP_R0, 0 ),
507 make_xmm( xmm_dst ) );
508
509 x86_push(
510 func,
511 x86_make_reg( file_REG32, reg_AX) );
512 x86_push(
513 func,
514 x86_make_reg( file_REG32, reg_CX) );
515 x86_push(
516 func,
517 x86_make_reg( file_REG32, reg_DX) );
518
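   /* Count how many xmm registers must be preserved, reserve stack space
    * for them, and spill each one before the call.
    */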
519 for(i = 0, n = 0; i < 8; ++i)
520 if(xmm_mask & (1 << i))
521 ++n;
522
523 x86_sub_imm(
524 func,
525 x86_make_reg( file_REG32, reg_SP ),
526 n*16);
527
528 for(i = 0, n = 0; i < 8; ++i)
529 if(xmm_mask & (1 << i)) {
530 sse_movups(
531 func,
532 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
  533             make_xmm( i ) );
534 ++n;
535 }
536
537 x86_lea(
538 func,
539 ecx,
540 get_temp( TEMP_R0, 0 ) );
541
542 x86_push( func, ecx );
543 x86_mov_reg_imm( func, ecx, (unsigned long) code );
544 x86_call( func, ecx );
545 x86_pop(func, ecx );
546
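   /* Reload the saved xmm registers and release the stack space. */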
547 for(i = 0, n = 0; i < 8; ++i)
548 if(xmm_mask & (1 << i)) {
549 sse_movups(
550 func,
  551             make_xmm( i ),
552 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
553 ++n;
554 }
555
556 x86_add_imm(
557 func,
558 x86_make_reg( file_REG32, reg_SP ),
559 n*16);
560
  561    /* Restore GP registers in reverse order.
562 */
563 x86_pop(
564 func,
565 x86_make_reg( file_REG32, reg_DX) );
566 x86_pop(
567 func,
568 x86_make_reg( file_REG32, reg_CX) );
569 x86_pop(
570 func,
571 x86_make_reg( file_REG32, reg_AX) );
572
573 sse_movaps(
574 func,
575 make_xmm( xmm_dst ),
576 get_temp( TEMP_R0, 0 ) );
577 }
578
579 static void
580 emit_func_call_dst_src(
581 struct x86_function *func,
582 unsigned xmm_save,
583 unsigned xmm_dst,
584 unsigned xmm_src,
585 void (PIPE_CDECL *code)() )
586 {
587 sse_movaps(
588 func,
589 get_temp( TEMP_R0, 1 ),
590 make_xmm( xmm_src ) );
591
592 emit_func_call_dst(
593 func,
594 xmm_save,
595 xmm_dst,
596 code );
597 }
598
599 /*
600 * Fast SSE2 implementation of special math functions.
601 */
602
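/* Horner-style polynomial evaluation on four packed floats:
 * POLYn(x, c0, ..., cn) computes c0 + x*(c1 + x*(c2 + ...)) per lane.
 */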
603 #define POLY0(x, c0) _mm_set1_ps(c0)
604 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
605 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
606 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
607 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
608 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
609
610 #define EXP_POLY_DEGREE 3
611 #define LOG_POLY_DEGREE 5
612
613 /**
614 * See http://www.devmaster.net/forums/showthread.php?p=43580
615 */
616 static INLINE __m128
617 exp2f4(__m128 x)
618 {
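   /* Split x into integer and fractional parts, so 2^x = 2^ipart * 2^fpart:
    * 2^ipart is built directly in the float exponent field, and 2^fpart is
    * approximated with a minimax polynomial on [-0.5, 0.5).
    */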
619 __m128i ipart;
620 __m128 fpart, expipart, expfpart;
621
622 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
623 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
624
625 /* ipart = int(x - 0.5) */
626 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
627
628 /* fpart = x - ipart */
629 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
630
631 /* expipart = (float) (1 << ipart) */
632 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
633
634 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
635 #if EXP_POLY_DEGREE == 5
636 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
637 #elif EXP_POLY_DEGREE == 4
638 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
639 #elif EXP_POLY_DEGREE == 3
640 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
641 #elif EXP_POLY_DEGREE == 2
642 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
643 #else
644 #error
645 #endif
646
647 return _mm_mul_ps(expipart, expfpart);
648 }
649
650 /**
651 * See http://www.devmaster.net/forums/showthread.php?p=43580
652 */
653 static INLINE __m128
654 log2f4(__m128 x)
655 {
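   /* Decompose x = 2^exp * mant with mant in [1, 2):
    * log2(x) = exp + log2(mant), and log2(mant) is approximated below by a
    * minimax polynomial multiplied by (mant - 1) so that log2(1) == 0.
    */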
656 __m128i expmask = _mm_set1_epi32(0x7f800000);
657 __m128i mantmask = _mm_set1_epi32(0x007fffff);
658 __m128 one = _mm_set1_ps(1.0f);
659
660 __m128i i = _mm_castps_si128(x);
661
662 /* exp = (float) exponent(x) */
663 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
664
665 /* mant = (float) mantissa(x) */
666 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
667
668 __m128 logmant;
669
670 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
  671     * These coefficients can be generated with
672 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
673 */
674 #if LOG_POLY_DEGREE == 6
675 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
676 #elif LOG_POLY_DEGREE == 5
677 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
678 #elif LOG_POLY_DEGREE == 4
679 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
680 #elif LOG_POLY_DEGREE == 3
681 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
682 #else
683 #error
684 #endif
685
  686    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
687 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
688
689 return _mm_add_ps(logmant, exp);
690 }
691
692 static INLINE __m128
693 powf4(__m128 x, __m128 y)
694 {
695 return exp2f4(_mm_mul_ps(log2f4(x), y));
696 }
697
698
699 /**
700 * Low-level instruction translators.
701 */
702
703 static void
704 emit_abs(
705 struct x86_function *func,
706 unsigned xmm )
707 {
708 sse_andps(
709 func,
710 make_xmm( xmm ),
711 get_temp(
712 TGSI_EXEC_TEMP_7FFFFFFF_I,
713 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
714 }
715
716 static void
717 emit_add(
718 struct x86_function *func,
719 unsigned xmm_dst,
720 unsigned xmm_src )
721 {
722 sse_addps(
723 func,
724 make_xmm( xmm_dst ),
725 make_xmm( xmm_src ) );
726 }
727
728 static void PIPE_CDECL
729 cos4f(
730 float *store )
731 {
732 store[0] = cosf( store[0] );
733 store[1] = cosf( store[1] );
734 store[2] = cosf( store[2] );
735 store[3] = cosf( store[3] );
736 }
737
738 static void
739 emit_cos(
740 struct x86_function *func,
741 unsigned xmm_save,
742 unsigned xmm_dst )
743 {
744 emit_func_call_dst(
745 func,
746 xmm_save,
747 xmm_dst,
748 cos4f );
749 }
750
751 static void PIPE_CDECL
752 #if defined(PIPE_CC_GCC)
753 __attribute__((force_align_arg_pointer))
754 #endif
755 ex24f(
756 float *store )
757 {
758 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
759 }
760
761 static void
762 emit_ex2(
763 struct x86_function *func,
764 unsigned xmm_save,
765 unsigned xmm_dst )
766 {
767 emit_func_call_dst(
768 func,
769 xmm_save,
770 xmm_dst,
771 ex24f );
772 }
773
774 static void
775 emit_f2it(
776 struct x86_function *func,
777 unsigned xmm )
778 {
779 sse2_cvttps2dq(
780 func,
781 make_xmm( xmm ),
782 make_xmm( xmm ) );
783 }
784
785 static void PIPE_CDECL
786 flr4f(
787 float *store )
788 {
789 store[0] = floorf( store[0] );
790 store[1] = floorf( store[1] );
791 store[2] = floorf( store[2] );
792 store[3] = floorf( store[3] );
793 }
794
795 static void
796 emit_flr(
797 struct x86_function *func,
798 unsigned xmm_save,
799 unsigned xmm_dst )
800 {
801 emit_func_call_dst(
802 func,
803 xmm_save,
804 xmm_dst,
805 flr4f );
806 }
807
808 static void PIPE_CDECL
809 frc4f(
810 float *store )
811 {
812 store[0] -= floorf( store[0] );
813 store[1] -= floorf( store[1] );
814 store[2] -= floorf( store[2] );
815 store[3] -= floorf( store[3] );
816 }
817
818 static void
819 emit_frc(
820 struct x86_function *func,
821 unsigned xmm_save,
822 unsigned xmm_dst )
823 {
824 emit_func_call_dst(
825 func,
826 xmm_save,
827 xmm_dst,
828 frc4f );
829 }
830
831 static void PIPE_CDECL
832 #if defined(PIPE_CC_GCC)
833 __attribute__((force_align_arg_pointer))
834 #endif
835 lg24f(
836 float *store )
837 {
838 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
839 }
840
841 static void
842 emit_lg2(
843 struct x86_function *func,
844 unsigned xmm_save,
845 unsigned xmm_dst )
846 {
847 emit_func_call_dst(
848 func,
849 xmm_save,
850 xmm_dst,
851 lg24f );
852 }
853
854 static void
855 emit_MOV(
856 struct x86_function *func,
857 unsigned xmm_dst,
858 unsigned xmm_src )
859 {
860 sse_movups(
861 func,
862 make_xmm( xmm_dst ),
863 make_xmm( xmm_src ) );
864 }
865
866 static void
867 emit_mul (struct x86_function *func,
868 unsigned xmm_dst,
869 unsigned xmm_src)
870 {
871 sse_mulps(
872 func,
873 make_xmm( xmm_dst ),
874 make_xmm( xmm_src ) );
875 }
876
877 static void
878 emit_neg(
879 struct x86_function *func,
880 unsigned xmm )
881 {
882 sse_xorps(
883 func,
884 make_xmm( xmm ),
885 get_temp(
886 TGSI_EXEC_TEMP_80000000_I,
887 TGSI_EXEC_TEMP_80000000_C ) );
888 }
889
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC)
892 __attribute__((force_align_arg_pointer))
893 #endif
894 pow4f(
895 float *store )
896 {
897 #if 1
898 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
899 #else
900 store[0] = powf( store[0], store[4] );
901 store[1] = powf( store[1], store[5] );
902 store[2] = powf( store[2], store[6] );
903 store[3] = powf( store[3], store[7] );
904 #endif
905 }
906
907 static void
908 emit_pow(
909 struct x86_function *func,
910 unsigned xmm_save,
911 unsigned xmm_dst,
912 unsigned xmm_src )
913 {
914 emit_func_call_dst_src(
915 func,
916 xmm_save,
917 xmm_dst,
918 xmm_src,
919 pow4f );
920 }
921
922 static void
923 emit_rcp (
924 struct x86_function *func,
925 unsigned xmm_dst,
926 unsigned xmm_src )
927 {
928 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
929 * good enough. Need to either emit a proper divide or use the
930 * iterative technique described below in emit_rsqrt().
931 */
932 sse2_rcpps(
933 func,
934 make_xmm( xmm_dst ),
935 make_xmm( xmm_src ) );
936 }
937
938 static void
939 emit_rsqrt(
940 struct x86_function *func,
941 unsigned xmm_dst,
942 unsigned xmm_src )
943 {
944 #if HIGH_PRECISION
945 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
  946     * implementations, it is possible to improve their precision at
  947     * fairly low cost, using a Newton-Raphson step, as below:
  948     *
  949     * rcp:   x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
  950     * rsqrt: x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
951 *
952 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
953 */
954 {
955 struct x86_reg dst = make_xmm( xmm_dst );
956 struct x86_reg src = make_xmm( xmm_src );
957 struct x86_reg tmp0 = make_xmm( 2 );
958 struct x86_reg tmp1 = make_xmm( 3 );
959
960 assert( xmm_dst != xmm_src );
961 assert( xmm_dst != 2 && xmm_dst != 3 );
962 assert( xmm_src != 2 && xmm_src != 3 );
963
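      /* dst = 0.5, tmp0 = 3.0, tmp1 = rsqrtps(a); the sequence below
       * computes 0.5 * tmp1 * (3.0 - a * tmp1 * tmp1) into dst, using the
       * src register as scratch (it is clobbered).
       */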
964 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
965 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
966 sse_rsqrtps( func, tmp1, src );
967 sse_mulps( func, src, tmp1 );
968 sse_mulps( func, dst, tmp1 );
969 sse_mulps( func, src, tmp1 );
970 sse_subps( func, tmp0, src );
971 sse_mulps( func, dst, tmp0 );
972 }
973 #else
974 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
975 * good enough.
976 */
977 sse_rsqrtps(
978 func,
979 make_xmm( xmm_dst ),
980 make_xmm( xmm_src ) );
981 #endif
982 }
983
984 static void
985 emit_setsign(
986 struct x86_function *func,
987 unsigned xmm )
988 {
989 sse_orps(
990 func,
991 make_xmm( xmm ),
992 get_temp(
993 TGSI_EXEC_TEMP_80000000_I,
994 TGSI_EXEC_TEMP_80000000_C ) );
995 }
996
997 static void PIPE_CDECL
998 sin4f(
999 float *store )
1000 {
1001 store[0] = sinf( store[0] );
1002 store[1] = sinf( store[1] );
1003 store[2] = sinf( store[2] );
1004 store[3] = sinf( store[3] );
1005 }
1006
1007 static void
1008 emit_sin (struct x86_function *func,
1009 unsigned xmm_save,
1010 unsigned xmm_dst)
1011 {
1012 emit_func_call_dst(
1013 func,
1014 xmm_save,
1015 xmm_dst,
1016 sin4f );
1017 }
1018
1019 static void
1020 emit_sub(
1021 struct x86_function *func,
1022 unsigned xmm_dst,
1023 unsigned xmm_src )
1024 {
1025 sse_subps(
1026 func,
1027 make_xmm( xmm_dst ),
1028 make_xmm( xmm_src ) );
1029 }
1030
1031 /**
1032 * Register fetch.
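 *
 * Load one channel of a TGSI source operand into the given xmm register,
 * honoring extended swizzles (including the ZERO/ONE selectors) and the
 * sign modes (absolute, set-sign, negate).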
1033 */
1034
1035 static void
1036 emit_fetch(
1037 struct x86_function *func,
1038 unsigned xmm,
1039 const struct tgsi_full_src_register *reg,
1040 const unsigned chan_index )
1041 {
1042 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1043
1044 switch (swizzle) {
1045 case TGSI_EXTSWIZZLE_X:
1046 case TGSI_EXTSWIZZLE_Y:
1047 case TGSI_EXTSWIZZLE_Z:
1048 case TGSI_EXTSWIZZLE_W:
1049 switch (reg->SrcRegister.File) {
1050 case TGSI_FILE_CONSTANT:
1051 emit_const(
1052 func,
1053 xmm,
1054 reg->SrcRegister.Index,
1055 swizzle,
1056 reg->SrcRegister.Indirect,
1057 reg->SrcRegisterInd.File,
1058 reg->SrcRegisterInd.Index );
1059 break;
1060
1061 case TGSI_FILE_IMMEDIATE:
1062 emit_immediate(
1063 func,
1064 xmm,
1065 reg->SrcRegister.Index,
1066 swizzle );
1067 break;
1068
1069 case TGSI_FILE_INPUT:
1070 emit_inputf(
1071 func,
1072 xmm,
1073 reg->SrcRegister.Index,
1074 swizzle );
1075 break;
1076
1077 case TGSI_FILE_TEMPORARY:
1078 emit_tempf(
1079 func,
1080 xmm,
1081 reg->SrcRegister.Index,
1082 swizzle );
1083 break;
1084
1085 default:
1086 assert( 0 );
1087 }
1088 break;
1089
1090 case TGSI_EXTSWIZZLE_ZERO:
1091 emit_tempf(
1092 func,
1093 xmm,
1094 TGSI_EXEC_TEMP_00000000_I,
1095 TGSI_EXEC_TEMP_00000000_C );
1096 break;
1097
1098 case TGSI_EXTSWIZZLE_ONE:
1099 emit_tempf(
1100 func,
1101 xmm,
1102 TEMP_ONE_I,
1103 TEMP_ONE_C );
1104 break;
1105
1106 default:
1107 assert( 0 );
1108 }
1109
1110 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1111 case TGSI_UTIL_SIGN_CLEAR:
1112 emit_abs( func, xmm );
1113 break;
1114
1115 case TGSI_UTIL_SIGN_SET:
1116 emit_setsign( func, xmm );
1117 break;
1118
1119 case TGSI_UTIL_SIGN_TOGGLE:
1120 emit_neg( func, xmm );
1121 break;
1122
1123 case TGSI_UTIL_SIGN_KEEP:
1124 break;
1125 }
1126 }
1127
1128 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1129 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1130
1131 /**
1132 * Register store.
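 *
 * Write an xmm register to one channel of the destination operand.
 * Saturation is not implemented: ZERO_ONE is currently ignored and
 * MINUS_PLUS_ONE asserts.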
1133 */
1134
1135 static void
1136 emit_store(
1137 struct x86_function *func,
1138 unsigned xmm,
1139 const struct tgsi_full_dst_register *reg,
1140 const struct tgsi_full_instruction *inst,
1141 unsigned chan_index )
1142 {
1143 switch( reg->DstRegister.File ) {
1144 case TGSI_FILE_OUTPUT:
1145 emit_output(
1146 func,
1147 xmm,
1148 reg->DstRegister.Index,
1149 chan_index );
1150 break;
1151
1152 case TGSI_FILE_TEMPORARY:
1153 emit_temps(
1154 func,
1155 xmm,
1156 reg->DstRegister.Index,
1157 chan_index );
1158 break;
1159
1160 case TGSI_FILE_ADDRESS:
1161 emit_addrs(
1162 func,
1163 xmm,
1164 reg->DstRegister.Index,
1165 chan_index );
1166 break;
1167
1168 default:
1169 assert( 0 );
1170 }
1171
1172 switch( inst->Instruction.Saturate ) {
1173 case TGSI_SAT_NONE:
1174 break;
1175
1176 case TGSI_SAT_ZERO_ONE:
1177 /* assert( 0 ); */
1178 break;
1179
1180 case TGSI_SAT_MINUS_PLUS_ONE:
1181 assert( 0 );
1182 break;
1183 }
1184 }
1185
1186 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1187 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1188
1189 /**
1190 * High-level instruction translators.
1191 */
1192
1193 static void
1194 emit_kil(
1195 struct x86_function *func,
1196 const struct tgsi_full_src_register *reg )
1197 {
1198 unsigned uniquemask;
1199 unsigned registers[4];
1200 unsigned nextregister = 0;
1201 unsigned firstchan = ~0;
1202 unsigned chan_index;
1203
1204 /* This mask stores component bits that were already tested. Note that
 1205     * we test if the value is less than zero, so 1.0 and 0.0 need not be
1206 * tested. */
1207 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1208
1209 FOR_EACH_CHANNEL( chan_index ) {
1210 unsigned swizzle;
1211
1212 /* unswizzle channel */
1213 swizzle = tgsi_util_get_full_src_register_extswizzle(
1214 reg,
1215 chan_index );
1216
1217 /* check if the component has not been already tested */
1218 if( !(uniquemask & (1 << swizzle)) ) {
1219 uniquemask |= 1 << swizzle;
1220
1221 /* allocate register */
1222 registers[chan_index] = nextregister;
1223 emit_fetch(
1224 func,
1225 nextregister,
1226 reg,
1227 chan_index );
1228 nextregister++;
1229
1230 /* mark the first channel used */
1231 if( firstchan == ~0 ) {
1232 firstchan = chan_index;
1233 }
1234 }
1235 }
1236
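   /* Compare each fetched channel against zero, OR the resulting pmovmskb
    * masks together, and accumulate them into the persistent kill mask in
    * TGSI_EXEC_TEMP_KILMASK.
    */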
1237 x86_push(
1238 func,
1239 x86_make_reg( file_REG32, reg_AX ) );
1240 x86_push(
1241 func,
1242 x86_make_reg( file_REG32, reg_DX ) );
1243
1244 FOR_EACH_CHANNEL( chan_index ) {
1245 if( uniquemask & (1 << chan_index) ) {
1246 sse_cmpps(
1247 func,
1248 make_xmm( registers[chan_index] ),
1249 get_temp(
1250 TGSI_EXEC_TEMP_00000000_I,
1251 TGSI_EXEC_TEMP_00000000_C ),
1252 cc_LessThan );
1253
1254 if( chan_index == firstchan ) {
1255 sse_pmovmskb(
1256 func,
1257 x86_make_reg( file_REG32, reg_AX ),
1258 make_xmm( registers[chan_index] ) );
1259 }
1260 else {
1261 sse_pmovmskb(
1262 func,
1263 x86_make_reg( file_REG32, reg_DX ),
1264 make_xmm( registers[chan_index] ) );
1265 x86_or(
1266 func,
1267 x86_make_reg( file_REG32, reg_AX ),
1268 x86_make_reg( file_REG32, reg_DX ) );
1269 }
1270 }
1271 }
1272
1273 x86_or(
1274 func,
1275 get_temp(
1276 TGSI_EXEC_TEMP_KILMASK_I,
1277 TGSI_EXEC_TEMP_KILMASK_C ),
1278 x86_make_reg( file_REG32, reg_AX ) );
1279
1280 x86_pop(
1281 func,
1282 x86_make_reg( file_REG32, reg_DX ) );
1283 x86_pop(
1284 func,
1285 x86_make_reg( file_REG32, reg_AX ) );
1286 }
1287
1288
1289 static void
1290 emit_kilp(
1291 struct x86_function *func )
1292 {
1293 /* XXX todo / fix me */
1294 }
1295
1296
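/* SLT/SGE-style comparisons: cmpps yields an all-ones/all-zeros mask per
 * channel, which is ANDed with 1.0f to produce the TGSI 1.0/0.0 result.
 */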
1297 static void
1298 emit_setcc(
1299 struct x86_function *func,
1300 struct tgsi_full_instruction *inst,
1301 enum sse_cc cc )
1302 {
1303 unsigned chan_index;
1304
1305 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1306 FETCH( func, *inst, 0, 0, chan_index );
1307 FETCH( func, *inst, 1, 1, chan_index );
1308 sse_cmpps(
1309 func,
1310 make_xmm( 0 ),
1311 make_xmm( 1 ),
1312 cc );
1313 sse_andps(
1314 func,
1315 make_xmm( 0 ),
1316 get_temp(
1317 TEMP_ONE_I,
1318 TEMP_ONE_C ) );
1319 STORE( func, *inst, 0, 0, chan_index );
1320 }
1321 }
1322
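/* CMP: per channel, dst = (src0 < 0.0) ? src1 : src2, implemented with a
 * cmpps mask plus andps/andnps/orps selection.
 */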
1323 static void
1324 emit_cmp(
1325 struct x86_function *func,
1326 struct tgsi_full_instruction *inst )
1327 {
1328 unsigned chan_index;
1329
1330 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1331 FETCH( func, *inst, 0, 0, chan_index );
1332 FETCH( func, *inst, 1, 1, chan_index );
1333 FETCH( func, *inst, 2, 2, chan_index );
1334 sse_cmpps(
1335 func,
1336 make_xmm( 0 ),
1337 get_temp(
1338 TGSI_EXEC_TEMP_00000000_I,
1339 TGSI_EXEC_TEMP_00000000_C ),
1340 cc_LessThan );
1341 sse_andps(
1342 func,
1343 make_xmm( 1 ),
1344 make_xmm( 0 ) );
1345 sse_andnps(
1346 func,
1347 make_xmm( 0 ),
1348 make_xmm( 2 ) );
1349 sse_orps(
1350 func,
1351 make_xmm( 0 ),
1352 make_xmm( 1 ) );
1353 STORE( func, *inst, 0, 0, chan_index );
1354 }
1355 }
1356
1357 static int
1358 emit_instruction(
1359 struct x86_function *func,
1360 struct tgsi_full_instruction *inst )
1361 {
1362 unsigned chan_index;
1363
1364 switch (inst->Instruction.Opcode) {
1365 case TGSI_OPCODE_ARL:
1366 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1367 FETCH( func, *inst, 0, 0, chan_index );
1368 emit_f2it( func, 0 );
1369 STORE( func, *inst, 0, 0, chan_index );
1370 }
1371 break;
1372
1373 case TGSI_OPCODE_MOV:
1374 case TGSI_OPCODE_SWZ:
1375 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1376 FETCH( func, *inst, 0, 0, chan_index );
1377 STORE( func, *inst, 0, 0, chan_index );
1378 }
1379 break;
1380
1381 case TGSI_OPCODE_LIT:
1382 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1383 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1384 emit_tempf(
1385 func,
1386 0,
1387 TEMP_ONE_I,
1388 TEMP_ONE_C);
1389 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1390 STORE( func, *inst, 0, 0, CHAN_X );
1391 }
1392 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1393 STORE( func, *inst, 0, 0, CHAN_W );
1394 }
1395 }
1396 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1397 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1398 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1399 FETCH( func, *inst, 0, 0, CHAN_X );
1400 sse_maxps(
1401 func,
1402 make_xmm( 0 ),
1403 get_temp(
1404 TGSI_EXEC_TEMP_00000000_I,
1405 TGSI_EXEC_TEMP_00000000_C ) );
1406 STORE( func, *inst, 0, 0, CHAN_Y );
1407 }
1408 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1409 /* XMM[1] = SrcReg[0].yyyy */
1410 FETCH( func, *inst, 1, 0, CHAN_Y );
1411 /* XMM[1] = max(XMM[1], 0) */
1412 sse_maxps(
1413 func,
1414 make_xmm( 1 ),
1415 get_temp(
1416 TGSI_EXEC_TEMP_00000000_I,
1417 TGSI_EXEC_TEMP_00000000_C ) );
1418 /* XMM[2] = SrcReg[0].wwww */
1419 FETCH( func, *inst, 2, 0, CHAN_W );
1420 /* XMM[2] = min(XMM[2], 128.0) */
1421 sse_minps(
1422 func,
1423 make_xmm( 2 ),
1424 get_temp(
1425 TGSI_EXEC_TEMP_128_I,
1426 TGSI_EXEC_TEMP_128_C ) );
1427 /* XMM[2] = max(XMM[2], -128.0) */
1428 sse_maxps(
1429 func,
1430 make_xmm( 2 ),
1431 get_temp(
1432 TGSI_EXEC_TEMP_MINUS_128_I,
1433 TGSI_EXEC_TEMP_MINUS_128_C ) );
1434 emit_pow( func, 3, 1, 2 );
1435 FETCH( func, *inst, 0, 0, CHAN_X );
1436 sse_xorps(
1437 func,
1438 make_xmm( 2 ),
1439 make_xmm( 2 ) );
1440 sse_cmpps(
1441 func,
1442 make_xmm( 2 ),
1443 make_xmm( 0 ),
1444 cc_LessThanEqual );
1445 sse_andps(
1446 func,
1447 make_xmm( 2 ),
1448 make_xmm( 1 ) );
1449 STORE( func, *inst, 2, 0, CHAN_Z );
1450 }
1451 }
1452 break;
1453
1454 case TGSI_OPCODE_RCP:
1455 /* TGSI_OPCODE_RECIP */
1456 FETCH( func, *inst, 0, 0, CHAN_X );
1457 emit_rcp( func, 0, 0 );
1458 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1459 STORE( func, *inst, 0, 0, chan_index );
1460 }
1461 break;
1462
1463 case TGSI_OPCODE_RSQ:
1464 /* TGSI_OPCODE_RECIPSQRT */
1465 FETCH( func, *inst, 0, 0, CHAN_X );
1466 emit_rsqrt( func, 1, 0 );
1467 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1468 STORE( func, *inst, 1, 0, chan_index );
1469 }
1470 break;
1471
1472 case TGSI_OPCODE_EXP:
1473 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1474 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1475 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1476 FETCH( func, *inst, 0, 0, CHAN_X );
1477 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1478 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1479 emit_MOV( func, 1, 0 );
1480 emit_flr( func, 2, 1 );
1481 /* dst.x = ex2(floor(src.x)) */
1482 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1483 emit_MOV( func, 2, 1 );
1484 emit_ex2( func, 3, 2 );
1485 STORE( func, *inst, 2, 0, CHAN_X );
1486 }
1487 /* dst.y = src.x - floor(src.x) */
1488 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1489 emit_MOV( func, 2, 0 );
1490 emit_sub( func, 2, 1 );
1491 STORE( func, *inst, 2, 0, CHAN_Y );
1492 }
1493 }
1494 /* dst.z = ex2(src.x) */
1495 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1496 emit_ex2( func, 3, 0 );
1497 STORE( func, *inst, 0, 0, CHAN_Z );
1498 }
1499 }
1500 /* dst.w = 1.0 */
1501 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1502 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1503 STORE( func, *inst, 0, 0, CHAN_W );
1504 }
1505 break;
1506
1507 case TGSI_OPCODE_LOG:
1508 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1509 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1510 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1511 FETCH( func, *inst, 0, 0, CHAN_X );
1512 emit_abs( func, 0 );
1513 emit_MOV( func, 1, 0 );
1514 emit_lg2( func, 2, 1 );
1515 /* dst.z = lg2(abs(src.x)) */
1516 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1517 STORE( func, *inst, 1, 0, CHAN_Z );
1518 }
1519 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1520 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1521 emit_flr( func, 2, 1 );
1522 /* dst.x = floor(lg2(abs(src.x))) */
1523 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1524 STORE( func, *inst, 1, 0, CHAN_X );
1525 }
 1526          /* dst.y = abs(src.x) / ex2(floor(lg2(abs(src.x)))) */
1527 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1528 emit_ex2( func, 2, 1 );
1529 emit_rcp( func, 1, 1 );
1530 emit_mul( func, 0, 1 );
1531 STORE( func, *inst, 0, 0, CHAN_Y );
1532 }
1533 }
1534 }
1535 /* dst.w = 1.0 */
1536 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1537 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1538 STORE( func, *inst, 0, 0, CHAN_W );
1539 }
1540 break;
1541
1542 case TGSI_OPCODE_MUL:
1543 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1544 FETCH( func, *inst, 0, 0, chan_index );
1545 FETCH( func, *inst, 1, 1, chan_index );
1546 emit_mul( func, 0, 1 );
1547 STORE( func, *inst, 0, 0, chan_index );
1548 }
1549 break;
1550
1551 case TGSI_OPCODE_ADD:
1552 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1553 FETCH( func, *inst, 0, 0, chan_index );
1554 FETCH( func, *inst, 1, 1, chan_index );
1555 emit_add( func, 0, 1 );
1556 STORE( func, *inst, 0, 0, chan_index );
1557 }
1558 break;
1559
1560 case TGSI_OPCODE_DP3:
1561 /* TGSI_OPCODE_DOT3 */
1562 FETCH( func, *inst, 0, 0, CHAN_X );
1563 FETCH( func, *inst, 1, 1, CHAN_X );
1564 emit_mul( func, 0, 1 );
1565 FETCH( func, *inst, 1, 0, CHAN_Y );
1566 FETCH( func, *inst, 2, 1, CHAN_Y );
1567 emit_mul( func, 1, 2 );
1568 emit_add( func, 0, 1 );
1569 FETCH( func, *inst, 1, 0, CHAN_Z );
1570 FETCH( func, *inst, 2, 1, CHAN_Z );
1571 emit_mul( func, 1, 2 );
1572 emit_add( func, 0, 1 );
1573 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1574 STORE( func, *inst, 0, 0, chan_index );
1575 }
1576 break;
1577
1578 case TGSI_OPCODE_DP4:
1579 /* TGSI_OPCODE_DOT4 */
1580 FETCH( func, *inst, 0, 0, CHAN_X );
1581 FETCH( func, *inst, 1, 1, CHAN_X );
1582 emit_mul( func, 0, 1 );
1583 FETCH( func, *inst, 1, 0, CHAN_Y );
1584 FETCH( func, *inst, 2, 1, CHAN_Y );
1585 emit_mul( func, 1, 2 );
1586 emit_add( func, 0, 1 );
1587 FETCH( func, *inst, 1, 0, CHAN_Z );
1588 FETCH( func, *inst, 2, 1, CHAN_Z );
 1589       emit_mul( func, 1, 2 );
 1590       emit_add( func, 0, 1 );
1591 FETCH( func, *inst, 1, 0, CHAN_W );
1592 FETCH( func, *inst, 2, 1, CHAN_W );
1593 emit_mul( func, 1, 2 );
1594 emit_add( func, 0, 1 );
1595 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1596 STORE( func, *inst, 0, 0, chan_index );
1597 }
1598 break;
1599
1600 case TGSI_OPCODE_DST:
1601 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1602 emit_tempf(
1603 func,
1604 0,
1605 TEMP_ONE_I,
1606 TEMP_ONE_C );
1607 STORE( func, *inst, 0, 0, CHAN_X );
1608 }
1609 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1610 FETCH( func, *inst, 0, 0, CHAN_Y );
1611 FETCH( func, *inst, 1, 1, CHAN_Y );
1612 emit_mul( func, 0, 1 );
1613 STORE( func, *inst, 0, 0, CHAN_Y );
1614 }
1615 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1616 FETCH( func, *inst, 0, 0, CHAN_Z );
1617 STORE( func, *inst, 0, 0, CHAN_Z );
1618 }
1619 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1620 FETCH( func, *inst, 0, 1, CHAN_W );
1621 STORE( func, *inst, 0, 0, CHAN_W );
1622 }
1623 break;
1624
1625 case TGSI_OPCODE_MIN:
1626 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1627 FETCH( func, *inst, 0, 0, chan_index );
1628 FETCH( func, *inst, 1, 1, chan_index );
1629 sse_minps(
1630 func,
1631 make_xmm( 0 ),
1632 make_xmm( 1 ) );
1633 STORE( func, *inst, 0, 0, chan_index );
1634 }
1635 break;
1636
1637 case TGSI_OPCODE_MAX:
1638 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1639 FETCH( func, *inst, 0, 0, chan_index );
1640 FETCH( func, *inst, 1, 1, chan_index );
1641 sse_maxps(
1642 func,
1643 make_xmm( 0 ),
1644 make_xmm( 1 ) );
1645 STORE( func, *inst, 0, 0, chan_index );
1646 }
1647 break;
1648
1649 case TGSI_OPCODE_SLT:
1650 /* TGSI_OPCODE_SETLT */
1651 emit_setcc( func, inst, cc_LessThan );
1652 break;
1653
1654 case TGSI_OPCODE_SGE:
1655 /* TGSI_OPCODE_SETGE */
1656 emit_setcc( func, inst, cc_NotLessThan );
1657 break;
1658
1659 case TGSI_OPCODE_MAD:
1660 /* TGSI_OPCODE_MADD */
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1662 FETCH( func, *inst, 0, 0, chan_index );
1663 FETCH( func, *inst, 1, 1, chan_index );
1664 FETCH( func, *inst, 2, 2, chan_index );
1665 emit_mul( func, 0, 1 );
1666 emit_add( func, 0, 2 );
1667 STORE( func, *inst, 0, 0, chan_index );
1668 }
1669 break;
1670
1671 case TGSI_OPCODE_SUB:
1672 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1673 FETCH( func, *inst, 0, 0, chan_index );
1674 FETCH( func, *inst, 1, 1, chan_index );
1675 emit_sub( func, 0, 1 );
1676 STORE( func, *inst, 0, 0, chan_index );
1677 }
1678 break;
1679
1680 case TGSI_OPCODE_LERP:
1681 /* TGSI_OPCODE_LRP */
1682 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1683 FETCH( func, *inst, 0, 0, chan_index );
1684 FETCH( func, *inst, 1, 1, chan_index );
1685 FETCH( func, *inst, 2, 2, chan_index );
1686 emit_sub( func, 1, 2 );
1687 emit_mul( func, 0, 1 );
1688 emit_add( func, 0, 2 );
1689 STORE( func, *inst, 0, 0, chan_index );
1690 }
1691 break;
1692
1693 case TGSI_OPCODE_CND:
1694 return 0;
1695 break;
1696
1697 case TGSI_OPCODE_CND0:
1698 return 0;
1699 break;
1700
1701 case TGSI_OPCODE_DOT2ADD:
1702 /* TGSI_OPCODE_DP2A */
1703 return 0;
1704 break;
1705
1706 case TGSI_OPCODE_INDEX:
1707 return 0;
1708 break;
1709
1710 case TGSI_OPCODE_NEGATE:
1711 return 0;
1712 break;
1713
1714 case TGSI_OPCODE_FRAC:
1715 /* TGSI_OPCODE_FRC */
1716 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1717 FETCH( func, *inst, 0, 0, chan_index );
1718 emit_frc( func, 0, 0 );
1719 STORE( func, *inst, 0, 0, chan_index );
1720 }
1721 break;
1722
1723 case TGSI_OPCODE_CLAMP:
1724 return 0;
1725 break;
1726
1727 case TGSI_OPCODE_FLOOR:
1728 /* TGSI_OPCODE_FLR */
1729 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1730 FETCH( func, *inst, 0, 0, chan_index );
1731 emit_flr( func, 0, 0 );
1732 STORE( func, *inst, 0, 0, chan_index );
1733 }
1734 break;
1735
1736 case TGSI_OPCODE_ROUND:
1737 return 0;
1738 break;
1739
1740 case TGSI_OPCODE_EXPBASE2:
1741 /* TGSI_OPCODE_EX2 */
1742 FETCH( func, *inst, 0, 0, CHAN_X );
1743 emit_ex2( func, 0, 0 );
1744 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1745 STORE( func, *inst, 0, 0, chan_index );
1746 }
1747 break;
1748
1749 case TGSI_OPCODE_LOGBASE2:
1750 /* TGSI_OPCODE_LG2 */
1751 FETCH( func, *inst, 0, 0, CHAN_X );
1752 emit_lg2( func, 0, 0 );
1753 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1754 STORE( func, *inst, 0, 0, chan_index );
1755 }
1756 break;
1757
1758 case TGSI_OPCODE_POWER:
1759 /* TGSI_OPCODE_POW */
1760 FETCH( func, *inst, 0, 0, CHAN_X );
1761 FETCH( func, *inst, 1, 1, CHAN_X );
1762 emit_pow( func, 0, 0, 1 );
1763 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1764 STORE( func, *inst, 0, 0, chan_index );
1765 }
1766 break;
1767
1768 case TGSI_OPCODE_CROSSPRODUCT:
1769 /* TGSI_OPCODE_XPD */
1770 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1771 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1772 FETCH( func, *inst, 1, 1, CHAN_Z );
1773 FETCH( func, *inst, 3, 0, CHAN_Z );
1774 }
1775 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1776 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1777 FETCH( func, *inst, 0, 0, CHAN_Y );
1778 FETCH( func, *inst, 4, 1, CHAN_Y );
1779 }
1780 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1781 emit_MOV( func, 2, 0 );
1782 emit_mul( func, 2, 1 );
1783 emit_MOV( func, 5, 3 );
1784 emit_mul( func, 5, 4 );
1785 emit_sub( func, 2, 5 );
1786 STORE( func, *inst, 2, 0, CHAN_X );
1787 }
1788 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1789 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1790 FETCH( func, *inst, 2, 1, CHAN_X );
1791 FETCH( func, *inst, 5, 0, CHAN_X );
1792 }
1793 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1794 emit_mul( func, 3, 2 );
1795 emit_mul( func, 1, 5 );
1796 emit_sub( func, 3, 1 );
1797 STORE( func, *inst, 3, 0, CHAN_Y );
1798 }
1799 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1800 emit_mul( func, 5, 4 );
1801 emit_mul( func, 0, 2 );
1802 emit_sub( func, 5, 0 );
1803 STORE( func, *inst, 5, 0, CHAN_Z );
1804 }
1805 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1806 emit_tempf(
1807 func,
1808 0,
1809 TEMP_ONE_I,
1810 TEMP_ONE_C );
1811 STORE( func, *inst, 0, 0, CHAN_W );
1812 }
1813 break;
1814
1815 case TGSI_OPCODE_MULTIPLYMATRIX:
1816 return 0;
1817 break;
1818
1819 case TGSI_OPCODE_ABS:
1820 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1821 FETCH( func, *inst, 0, 0, chan_index );
 1822          emit_abs( func, 0 );
1823
1824 STORE( func, *inst, 0, 0, chan_index );
1825 }
1826 break;
1827
1828 case TGSI_OPCODE_RCC:
1829 return 0;
1830 break;
1831
1832 case TGSI_OPCODE_DPH:
1833 FETCH( func, *inst, 0, 0, CHAN_X );
1834 FETCH( func, *inst, 1, 1, CHAN_X );
1835 emit_mul( func, 0, 1 );
1836 FETCH( func, *inst, 1, 0, CHAN_Y );
1837 FETCH( func, *inst, 2, 1, CHAN_Y );
1838 emit_mul( func, 1, 2 );
1839 emit_add( func, 0, 1 );
1840 FETCH( func, *inst, 1, 0, CHAN_Z );
1841 FETCH( func, *inst, 2, 1, CHAN_Z );
1842 emit_mul( func, 1, 2 );
1843 emit_add( func, 0, 1 );
1844 FETCH( func, *inst, 1, 1, CHAN_W );
1845 emit_add( func, 0, 1 );
1846 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1847 STORE( func, *inst, 0, 0, chan_index );
1848 }
1849 break;
1850
1851 case TGSI_OPCODE_COS:
1852 FETCH( func, *inst, 0, 0, CHAN_X );
1853 emit_cos( func, 0, 0 );
1854 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1855 STORE( func, *inst, 0, 0, chan_index );
1856 }
1857 break;
1858
1859 case TGSI_OPCODE_DDX:
1860 return 0;
1861 break;
1862
1863 case TGSI_OPCODE_DDY:
1864 return 0;
1865 break;
1866
1867 case TGSI_OPCODE_KILP:
1868 /* predicated kill */
1869 emit_kilp( func );
1870 return 0; /* XXX fix me */
1871 break;
1872
1873 case TGSI_OPCODE_KIL:
1874 /* conditional kill */
1875 emit_kil( func, &inst->FullSrcRegisters[0] );
1876 break;
1877
1878 case TGSI_OPCODE_PK2H:
1879 return 0;
1880 break;
1881
1882 case TGSI_OPCODE_PK2US:
1883 return 0;
1884 break;
1885
1886 case TGSI_OPCODE_PK4B:
1887 return 0;
1888 break;
1889
1890 case TGSI_OPCODE_PK4UB:
1891 return 0;
1892 break;
1893
1894 case TGSI_OPCODE_RFL:
1895 return 0;
1896 break;
1897
1898 case TGSI_OPCODE_SEQ:
1899 return 0;
1900 break;
1901
1902 case TGSI_OPCODE_SFL:
1903 return 0;
1904 break;
1905
1906 case TGSI_OPCODE_SGT:
1907 return 0;
1908 break;
1909
1910 case TGSI_OPCODE_SIN:
1911 FETCH( func, *inst, 0, 0, CHAN_X );
1912 emit_sin( func, 0, 0 );
1913 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1914 STORE( func, *inst, 0, 0, chan_index );
1915 }
1916 break;
1917
1918 case TGSI_OPCODE_SLE:
1919 return 0;
1920 break;
1921
1922 case TGSI_OPCODE_SNE:
1923 return 0;
1924 break;
1925
1926 case TGSI_OPCODE_STR:
1927 return 0;
1928 break;
1929
1930 case TGSI_OPCODE_TEX:
1931 if (0) {
1932 /* Disable dummy texture code:
1933 */
1934 emit_tempf(
1935 func,
1936 0,
1937 TEMP_ONE_I,
1938 TEMP_ONE_C );
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1940 STORE( func, *inst, 0, 0, chan_index );
1941 }
1942 }
1943 else {
1944 return 0;
1945 }
1946 break;
1947
1948 case TGSI_OPCODE_TXD:
1949 return 0;
1950 break;
1951
1952 case TGSI_OPCODE_UP2H:
1953 return 0;
1954 break;
1955
1956 case TGSI_OPCODE_UP2US:
1957 return 0;
1958 break;
1959
1960 case TGSI_OPCODE_UP4B:
1961 return 0;
1962 break;
1963
1964 case TGSI_OPCODE_UP4UB:
1965 return 0;
1966 break;
1967
1968 case TGSI_OPCODE_X2D:
1969 return 0;
1970 break;
1971
1972 case TGSI_OPCODE_ARA:
1973 return 0;
1974 break;
1975
1976 case TGSI_OPCODE_ARR:
1977 return 0;
1978 break;
1979
1980 case TGSI_OPCODE_BRA:
1981 return 0;
1982 break;
1983
1984 case TGSI_OPCODE_CAL:
1985 return 0;
1986 break;
1987
1988 case TGSI_OPCODE_RET:
1989 emit_ret( func );
1990 break;
1991
1992 case TGSI_OPCODE_END:
1993 break;
1994
1995 case TGSI_OPCODE_SSG:
1996 return 0;
1997 break;
1998
1999 case TGSI_OPCODE_CMP:
2000 emit_cmp (func, inst);
2001 break;
2002
2003 case TGSI_OPCODE_SCS:
2004 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2005 FETCH( func, *inst, 0, 0, CHAN_X );
2006 emit_cos( func, 0, 0 );
2007 STORE( func, *inst, 0, 0, CHAN_X );
2008 }
2009 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2010 FETCH( func, *inst, 0, 0, CHAN_X );
2011 emit_sin( func, 0, 0 );
2012 STORE( func, *inst, 0, 0, CHAN_Y );
2013 }
2014 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2015 emit_tempf(
2016 func,
2017 0,
2018 TGSI_EXEC_TEMP_00000000_I,
2019 TGSI_EXEC_TEMP_00000000_C );
2020 STORE( func, *inst, 0, 0, CHAN_Z );
2021 }
2022 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2023 emit_tempf(
2024 func,
2025 0,
2026 TEMP_ONE_I,
2027 TEMP_ONE_C );
2028 STORE( func, *inst, 0, 0, CHAN_W );
2029 }
2030 break;
2031
2032 case TGSI_OPCODE_TXB:
2033 return 0;
2034 break;
2035
2036 case TGSI_OPCODE_NRM:
2037 return 0;
2038 break;
2039
2040 case TGSI_OPCODE_DIV:
2041 return 0;
2042 break;
2043
2044 case TGSI_OPCODE_DP2:
2045 return 0;
2046 break;
2047
2048 case TGSI_OPCODE_TXL:
2049 return 0;
2050 break;
2051
2052 case TGSI_OPCODE_BRK:
2053 return 0;
2054 break;
2055
2056 case TGSI_OPCODE_IF:
2057 return 0;
2058 break;
2059
2060 case TGSI_OPCODE_LOOP:
2061 return 0;
2062 break;
2063
2064 case TGSI_OPCODE_REP:
2065 return 0;
2066 break;
2067
2068 case TGSI_OPCODE_ELSE:
2069 return 0;
2070 break;
2071
2072 case TGSI_OPCODE_ENDIF:
2073 return 0;
2074 break;
2075
2076 case TGSI_OPCODE_ENDLOOP:
2077 return 0;
2078 break;
2079
2080 case TGSI_OPCODE_ENDREP:
2081 return 0;
2082 break;
2083
2084 case TGSI_OPCODE_PUSHA:
2085 return 0;
2086 break;
2087
2088 case TGSI_OPCODE_POPA:
2089 return 0;
2090 break;
2091
2092 case TGSI_OPCODE_CEIL:
2093 return 0;
2094 break;
2095
2096 case TGSI_OPCODE_I2F:
2097 return 0;
2098 break;
2099
2100 case TGSI_OPCODE_NOT:
2101 return 0;
2102 break;
2103
2104 case TGSI_OPCODE_TRUNC:
2105 return 0;
2106 break;
2107
2108 case TGSI_OPCODE_SHL:
2109 return 0;
2110 break;
2111
2112 case TGSI_OPCODE_SHR:
2113 return 0;
2114 break;
2115
2116 case TGSI_OPCODE_AND:
2117 return 0;
2118 break;
2119
2120 case TGSI_OPCODE_OR:
2121 return 0;
2122 break;
2123
2124 case TGSI_OPCODE_MOD:
2125 return 0;
2126 break;
2127
2128 case TGSI_OPCODE_XOR:
2129 return 0;
2130 break;
2131
2132 case TGSI_OPCODE_SAD:
2133 return 0;
2134 break;
2135
2136 case TGSI_OPCODE_TXF:
2137 return 0;
2138 break;
2139
2140 case TGSI_OPCODE_TXQ:
2141 return 0;
2142 break;
2143
2144 case TGSI_OPCODE_CONT:
2145 return 0;
2146 break;
2147
2148 case TGSI_OPCODE_EMIT:
2149 return 0;
2150 break;
2151
2152 case TGSI_OPCODE_ENDPRIM:
2153 return 0;
2154 break;
2155
2156 default:
2157 return 0;
2158 }
2159
2160 return 1;
2161 }
2162
2163 static void
2164 emit_declaration(
2165 struct x86_function *func,
2166 struct tgsi_full_declaration *decl )
2167 {
2168 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2169 unsigned first, last, mask;
2170 unsigned i, j;
2171
2172 first = decl->DeclarationRange.First;
2173 last = decl->DeclarationRange.Last;
2174 mask = decl->Declaration.UsageMask;
2175
2176 for( i = first; i <= last; i++ ) {
2177 for( j = 0; j < NUM_CHANNELS; j++ ) {
2178 if( mask & (1 << j) ) {
2179 switch( decl->Declaration.Interpolate ) {
2180 case TGSI_INTERPOLATE_CONSTANT:
2181 emit_coef_a0( func, 0, i, j );
2182 emit_inputs( func, 0, i, j );
2183 break;
2184
2185 case TGSI_INTERPOLATE_LINEAR:
2186 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2187 emit_coef_dadx( func, 1, i, j );
2188 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2189 emit_coef_dady( func, 3, i, j );
2190 emit_mul( func, 0, 1 ); /* x * dadx */
2191 emit_coef_a0( func, 4, i, j );
2192 emit_mul( func, 2, 3 ); /* y * dady */
2193 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2194 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2195 emit_inputs( func, 0, i, j );
2196 break;
2197
2198 case TGSI_INTERPOLATE_PERSPECTIVE:
2199 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2200 emit_coef_dadx( func, 1, i, j );
2201 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2202 emit_coef_dady( func, 3, i, j );
2203 emit_mul( func, 0, 1 ); /* x * dadx */
2204 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2205 emit_coef_a0( func, 5, i, j );
2206 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2207 emit_mul( func, 2, 3 ); /* y * dady */
2208 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2209 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2210 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2211 emit_inputs( func, 0, i, j );
2212 break;
2213
2214 default:
2215 assert( 0 );
2216 break;
2217 }
2218 }
2219 }
2220 }
2221 }
2222 }
2223
2224 static void aos_to_soa( struct x86_function *func,
2225 uint arg_aos,
2226 uint arg_soa,
2227 uint arg_num,
2228 uint arg_stride )
2229 {
2230 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2231 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2232 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2233 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2234 int inner_loop;
2235
2236
2237 /* Save EBX */
2238 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2239
2240 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2241 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2242 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2243 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2244
2245 /* do */
2246 inner_loop = x86_get_label( func );
2247 {
2248 x86_push( func, aos_input );
2249 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2250 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2251 x86_add( func, aos_input, stride );
2252 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2253 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2254 x86_add( func, aos_input, stride );
2255 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2256 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2257 x86_add( func, aos_input, stride );
2258 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2259 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2260 x86_pop( func, aos_input );
2261
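      /* 4x4 transpose: xmm0/xmm3 hold the {x,y}/{z,w} pairs of vertices 0-1
       * and xmm1/xmm4 those of vertices 2-3. shufps with 0x88 keeps the even
       * lanes (x or z) and 0xdd the odd lanes (y or w), producing one
       * register per component.
       */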
2262 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2263 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2264 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2265 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2266 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2267 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2268
2269 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2270 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2271 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2272 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2273
2274 /* Advance to next input */
2275 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2276 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2277 }
2278 /* while --num_inputs */
2279 x86_dec( func, num_inputs );
2280 x86_jcc( func, cc_NE, inner_loop );
2281
2282 /* Restore EBX */
2283 x86_pop( func, aos_input );
2284 }
2285
2286 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2287 {
2288 struct x86_reg soa_output;
2289 struct x86_reg aos_output;
2290 struct x86_reg num_outputs;
2291 struct x86_reg temp;
2292 int inner_loop;
2293
2294 soa_output = x86_make_reg( file_REG32, reg_AX );
2295 aos_output = x86_make_reg( file_REG32, reg_BX );
2296 num_outputs = x86_make_reg( file_REG32, reg_CX );
2297 temp = x86_make_reg( file_REG32, reg_DX );
2298
2299 /* Save EBX */
2300 x86_push( func, aos_output );
2301
2302 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2303 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2304 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2305
2306 /* do */
2307 inner_loop = x86_get_label( func );
2308 {
2309 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2310 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2311 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2312 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2313
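      /* Inverse transpose: unpcklps/unpckhps interleave the x/y and z/w
       * planes back into per-vertex {x,y} and {z,w} pairs, which are then
       * scattered with movlps/movhps at the caller-supplied stride.
       */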
2314 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2315 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2316 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2317 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2318 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2319 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2320
2321 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2322 x86_push( func, aos_output );
2323 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2324 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2325 x86_add( func, aos_output, temp );
2326 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2327 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2328 x86_add( func, aos_output, temp );
2329 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2330 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2331 x86_add( func, aos_output, temp );
2332 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2333 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2334 x86_pop( func, aos_output );
2335
2336 /* Advance to next output */
2337 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2338 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2339 }
2340 /* while --num_outputs */
2341 x86_dec( func, num_outputs );
2342 x86_jcc( func, cc_NE, inner_loop );
2343
2344 /* Restore EBX */
2345 x86_pop( func, aos_output );
2346 }
2347
2348 /**
2349 * Translate a TGSI vertex/fragment shader to SSE2 code.
2350 * Slightly different things are done for vertex vs. fragment shaders.
2351 *
2352 * Note that fragment shaders are responsible for interpolating shader
2353 * inputs. Because on x86 we have only 4 GP registers, and here we
2354 * have 5 shader arguments (input, output, const, temp and coef), the
2355 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 2356  * The GP register holding the output argument is aliased with the coeff
2357 * argument, as outputs are not needed in the DECLARATION phase.
2358 *
2359 * \param tokens the TGSI input shader
2360 * \param func the output SSE code/function
2361 * \param immediates buffer to place immediates, later passed to SSE func
 2362  * \return 1 for success, 0 if translation failed
2363 */
2364 unsigned
2365 tgsi_emit_sse2(
2366 const struct tgsi_token *tokens,
2367 struct x86_function *func,
2368 float (*immediates)[4],
2369 boolean do_swizzles )
2370 {
2371 struct tgsi_parse_context parse;
2372 boolean instruction_phase = FALSE;
2373 unsigned ok = 1;
2374 uint num_immediates = 0;
2375
2376 util_init_math();
2377
2378 func->csr = func->store;
2379
2380 tgsi_parse_init( &parse, tokens );
2381
2382 /* Can't just use EDI, EBX without save/restoring them:
2383 */
2384 x86_push(
2385 func,
2386 get_immediate_base() );
2387
2388 x86_push(
2389 func,
2390 get_temp_base() );
2391
2392
2393 /*
2394 * Different function args for vertex/fragment shaders:
2395 */
2396 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2397 /* DECLARATION phase, do not load output argument. */
2398 x86_mov(
2399 func,
2400 get_input_base(),
2401 x86_fn_arg( func, 1 ) );
2402 /* skipping outputs argument here */
2403 x86_mov(
2404 func,
2405 get_const_base(),
2406 x86_fn_arg( func, 3 ) );
2407 x86_mov(
2408 func,
2409 get_temp_base(),
2410 x86_fn_arg( func, 4 ) );
2411 x86_mov(
2412 func,
2413 get_coef_base(),
2414 x86_fn_arg( func, 5 ) );
2415 x86_mov(
2416 func,
2417 get_immediate_base(),
2418 x86_fn_arg( func, 6 ) );
2419 }
2420 else {
2421 assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
2422
2423 if (do_swizzles)
2424 aos_to_soa( func,
2425 6, /* aos_input */
2426 1, /* machine->input */
2427 7, /* num_inputs */
2428 8 ); /* input_stride */
2429
2430 x86_mov(
2431 func,
2432 get_input_base(),
2433 x86_fn_arg( func, 1 ) );
2434 x86_mov(
2435 func,
2436 get_output_base(),
2437 x86_fn_arg( func, 2 ) );
2438 x86_mov(
2439 func,
2440 get_const_base(),
2441 x86_fn_arg( func, 3 ) );
2442 x86_mov(
2443 func,
2444 get_temp_base(),
2445 x86_fn_arg( func, 4 ) );
2446 x86_mov(
2447 func,
2448 get_immediate_base(),
2449 x86_fn_arg( func, 5 ) );
2450 }
2451
2452 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2453 tgsi_parse_token( &parse );
2454
2455 switch( parse.FullToken.Token.Type ) {
2456 case TGSI_TOKEN_TYPE_DECLARATION:
2457 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2458 emit_declaration(
2459 func,
2460 &parse.FullToken.FullDeclaration );
2461 }
2462 break;
2463
2464 case TGSI_TOKEN_TYPE_INSTRUCTION:
2465 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2466 if( !instruction_phase ) {
2467 /* INSTRUCTION phase, overwrite coeff with output. */
2468 instruction_phase = TRUE;
2469 x86_mov(
2470 func,
2471 get_output_base(),
2472 x86_fn_arg( func, 2 ) );
2473 }
2474 }
2475
2476 ok = emit_instruction(
2477 func,
2478 &parse.FullToken.FullInstruction );
2479
2480 if (!ok) {
2481 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2482 parse.FullToken.FullInstruction.Instruction.Opcode,
2483 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2484 "vertex shader" : "fragment shader");
2485 }
2486 break;
2487
2488 case TGSI_TOKEN_TYPE_IMMEDIATE:
2489 /* simply copy the immediate values into the next immediates[] slot */
2490 {
2491 const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
2492 uint i;
2493 assert(size <= 4);
2494 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2495 for( i = 0; i < size; i++ ) {
2496 immediates[num_immediates][i] =
2497 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2498 }
2499 #if 0
2500 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2501 num_immediates,
2502 immediates[num_immediates][0],
2503 immediates[num_immediates][1],
2504 immediates[num_immediates][2],
2505 immediates[num_immediates][3]);
2506 #endif
2507 num_immediates++;
2508 }
2509 break;
2510
2511 default:
2512 ok = 0;
2513 assert( 0 );
2514 }
2515 }
2516
2517 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2518 if (do_swizzles)
2519 soa_to_aos( func, 9, 2, 10, 11 );
2520 }
2521
2522 /* Can't just use EBX, EDI without save/restoring them:
2523 */
2524 x86_pop(
2525 func,
2526 get_temp_base() );
2527
2528 x86_pop(
2529 func,
2530 get_immediate_base() );
2531
2532 emit_ret( func );
2533
2534 tgsi_parse_free( &parse );
2535
2536 return ok;
2537 }
2538
2539 #endif /* PIPE_ARCH_X86 */
2540