1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_sse.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi_exec.h"
39 #include "tgsi_sse2.h"
40
41 #include "rtasm/rtasm_x86sse.h"
42
43 /* Use a Newton-Raphson refinement step for 1/sqrt().
44 *
45 * This costs about 100 fps (close to 10%) in gears:
46 */
47 #define HIGH_PRECISION 1
48
49 #define FAST_MATH 1
50
51
52 #define FOR_EACH_CHANNEL( CHAN )\
53 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
54
55 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
56 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
57
58 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
60
61 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
62 FOR_EACH_CHANNEL( CHAN )\
63 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
64
65 #define CHAN_X 0
66 #define CHAN_Y 1
67 #define CHAN_Z 2
68 #define CHAN_W 3
69
70 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
71 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
72
73 #define TEMP_R0 TGSI_EXEC_TEMP_R0
74 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
75
76 /**
77 * X86 utility functions.
78 */
79
80 static struct x86_reg
81 make_xmm(
82 unsigned xmm )
83 {
84 return x86_make_reg(
85 file_XMM,
86 (enum x86_reg_name) xmm );
87 }
88
89 /**
90 * X86 register mapping helpers.
91 */
92
93 static struct x86_reg
94 get_const_base( void )
95 {
96 return x86_make_reg(
97 file_REG32,
98 reg_CX );
99 }
100
101 static struct x86_reg
102 get_input_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_output_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_DX );
115 }
116
117 static struct x86_reg
118 get_temp_base( void )
119 {
120 return x86_make_reg(
121 file_REG32,
122 reg_BX );
123 }
124
125 static struct x86_reg
126 get_coef_base( void )
127 {
128 return get_output_base();
129 }
130
131 static struct x86_reg
132 get_immediate_base( void )
133 {
134 return x86_make_reg(
135 file_REG32,
136 reg_DI );
137 }
138
139
140 /**
141 * Data access helpers.
142 */
143
144
145 static struct x86_reg
146 get_immediate(
147 unsigned vec,
148 unsigned chan )
149 {
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec * 4 + chan) * 4 );
153 }
154
155 static struct x86_reg
156 get_const(
157 unsigned vec,
158 unsigned chan )
159 {
160 return x86_make_disp(
161 get_const_base(),
162 (vec * 4 + chan) * 4 );
163 }
164
165 static struct x86_reg
166 get_input(
167 unsigned vec,
168 unsigned chan )
169 {
170 return x86_make_disp(
171 get_input_base(),
172 (vec * 4 + chan) * 16 );
173 }
174
175 static struct x86_reg
176 get_output(
177 unsigned vec,
178 unsigned chan )
179 {
180 return x86_make_disp(
181 get_output_base(),
182 (vec * 4 + chan) * 16 );
183 }
184
185 static struct x86_reg
186 get_temp(
187 unsigned vec,
188 unsigned chan )
189 {
190 return x86_make_disp(
191 get_temp_base(),
192 (vec * 4 + chan) * 16 );
193 }
194
195 static struct x86_reg
196 get_coef(
197 unsigned vec,
198 unsigned chan,
199 unsigned member )
200 {
201 return x86_make_disp(
202 get_coef_base(),
203 ((vec * 3 + member) * 4 + chan) * 4 );
204 }
205
206
207 static void
208 emit_ret(
209 struct x86_function *func )
210 {
211 x86_ret( func );
212 }
213
214
215 /**
216 * Data fetch helpers.
217 */
218
219 /**
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
224 */
225 static void
226 emit_const(
227 struct x86_function *func,
228 uint xmm,
229 int vec,
230 uint chan,
231 uint indirect,
232 uint indirectFile,
233 int indirectIndex )
234 {
235 if (indirect) {
236 struct x86_reg r0 = get_input_base();
237 struct x86_reg r1 = get_output_base();
238 uint i;
239
240 assert( indirectFile == TGSI_FILE_ADDRESS );
241 assert( indirectIndex == 0 );
242
243 x86_push( func, r0 );
244 x86_push( func, r1 );
245
246 for (i = 0; i < QUAD_SIZE; i++) {
247 x86_lea( func, r0, get_const( vec, chan ) );
248 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
249
250 /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
251 */
252 x86_add( func, r1, r1 );
253 x86_add( func, r1, r1 );
254 x86_add( func, r1, r1 );
255 x86_add( func, r1, r1 );
256
257 x86_add( func, r0, r1 );
258 x86_mov( func, r1, x86_deref( r0 ) );
259 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
260 }
261
262 x86_pop( func, r1 );
263 x86_pop( func, r0 );
264
265 sse_movaps(
266 func,
267 make_xmm( xmm ),
268 get_temp( TEMP_R0, CHAN_X ) );
269 }
270 else {
271 assert( vec >= 0 );
272
273 sse_movss(
274 func,
275 make_xmm( xmm ),
276 get_const( vec, chan ) );
277 sse_shufps(
278 func,
279 make_xmm( xmm ),
280 make_xmm( xmm ),
281 SHUF( 0, 0, 0, 0 ) );
282 }
283 }
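
/* With indirect addressing, the loop above effectively computes, for each
 * of the four quad elements i (a reading of the emitted code, not extra
 * code):
 *
 *    TEMP_R0.x[i] = CONST[ vec + ADDR.x[i] ].chan
 *
 * and the final movaps loads those four gathered values into the
 * destination xmm register.
 */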
284
285 static void
286 emit_immediate(
287 struct x86_function *func,
288 unsigned xmm,
289 unsigned vec,
290 unsigned chan )
291 {
292 sse_movss(
293 func,
294 make_xmm( xmm ),
295 get_immediate( vec, chan ) );
296 sse_shufps(
297 func,
298 make_xmm( xmm ),
299 make_xmm( xmm ),
300 SHUF( 0, 0, 0, 0 ) );
301 }
302
303
304 /**
305 * Copy a shader input to xmm register
306 * \param xmm the destination xmm register
307 * \param vec the src input attrib
308 * \param chan src channel to fetch (X, Y, Z or W)
309 */
310 static void
311 emit_inputf(
312 struct x86_function *func,
313 unsigned xmm,
314 unsigned vec,
315 unsigned chan )
316 {
317 sse_movups(
318 func,
319 make_xmm( xmm ),
320 get_input( vec, chan ) );
321 }
322
323 /**
324 * Store an xmm register to a shader output
325 * \param xmm the source xmm register
326 * \param vec the dest output attrib
327 * \param chan the dest channel to store (X, Y, Z or W)
328 */
329 static void
330 emit_output(
331 struct x86_function *func,
332 unsigned xmm,
333 unsigned vec,
334 unsigned chan )
335 {
336 sse_movups(
337 func,
338 get_output( vec, chan ),
339 make_xmm( xmm ) );
340 }
341
342 /**
343 * Copy a shader temporary to xmm register
344 * \param xmm the destination xmm register
345 * \param vec the src temp register
346 * \param chan src channel to fetch (X, Y, Z or W)
347 */
348 static void
349 emit_tempf(
350 struct x86_function *func,
351 unsigned xmm,
352 unsigned vec,
353 unsigned chan )
354 {
355 sse_movaps(
356 func,
357 make_xmm( xmm ),
358 get_temp( vec, chan ) );
359 }
360
361 /**
362 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
363 * \param xmm the destination xmm register
364 * \param vec the src input/attribute coefficient index
365 * \param chan src channel to fetch (X, Y, Z or W)
366 * \param member 0=a0, 1=dadx, 2=dady
367 */
368 static void
369 emit_coef(
370 struct x86_function *func,
371 unsigned xmm,
372 unsigned vec,
373 unsigned chan,
374 unsigned member )
375 {
376 sse_movss(
377 func,
378 make_xmm( xmm ),
379 get_coef( vec, chan, member ) );
380 sse_shufps(
381 func,
382 make_xmm( xmm ),
383 make_xmm( xmm ),
384 SHUF( 0, 0, 0, 0 ) );
385 }
386
387 /**
388 * Data store helpers.
389 */
390
391 static void
392 emit_inputs(
393 struct x86_function *func,
394 unsigned xmm,
395 unsigned vec,
396 unsigned chan )
397 {
398 sse_movups(
399 func,
400 get_input( vec, chan ),
401 make_xmm( xmm ) );
402 }
403
404 static void
405 emit_temps(
406 struct x86_function *func,
407 unsigned xmm,
408 unsigned vec,
409 unsigned chan )
410 {
411 sse_movaps(
412 func,
413 get_temp( vec, chan ),
414 make_xmm( xmm ) );
415 }
416
417 static void
418 emit_addrs(
419 struct x86_function *func,
420 unsigned xmm,
421 unsigned vec,
422 unsigned chan )
423 {
424 assert( vec == 0 );
425
426 emit_temps(
427 func,
428 xmm,
429 vec + TGSI_EXEC_TEMP_ADDR,
430 chan );
431 }
432
433 /**
434 * Coefficient fetch helpers.
435 */
436
437 static void
438 emit_coef_a0(
439 struct x86_function *func,
440 unsigned xmm,
441 unsigned vec,
442 unsigned chan )
443 {
444 emit_coef(
445 func,
446 xmm,
447 vec,
448 chan,
449 0 );
450 }
451
452 static void
453 emit_coef_dadx(
454 struct x86_function *func,
455 unsigned xmm,
456 unsigned vec,
457 unsigned chan )
458 {
459 emit_coef(
460 func,
461 xmm,
462 vec,
463 chan,
464 1 );
465 }
466
467 static void
468 emit_coef_dady(
469 struct x86_function *func,
470 unsigned xmm,
471 unsigned vec,
472 unsigned chan )
473 {
474 emit_coef(
475 func,
476 xmm,
477 vec,
478 chan,
479 2 );
480 }
481
482 /**
483 * Function call helpers.
484 */
485
486 /**
487 * NOTE: With gcc, if the called function uses SSE intrinsics, it must be
488 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
489 * that the stack pointer is 16-byte aligned as it expects.
490 */
491 static void
492 emit_func_call_dst(
493 struct x86_function *func,
494 unsigned xmm_save,
495 unsigned xmm_dst,
496 void (PIPE_CDECL *code)() )
497 {
498 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
499 unsigned i, n;
500 unsigned xmm_mask;
501
502 /* Bitmask of the xmm registers to save */
503 xmm_mask = (1 << xmm_save) - 1;
504 xmm_mask &= ~(1 << xmm_dst);
505
506 sse_movaps(
507 func,
508 get_temp( TEMP_R0, 0 ),
509 make_xmm( xmm_dst ) );
510
511 x86_push(
512 func,
513 x86_make_reg( file_REG32, reg_AX) );
514 x86_push(
515 func,
516 x86_make_reg( file_REG32, reg_CX) );
517 x86_push(
518 func,
519 x86_make_reg( file_REG32, reg_DX) );
520
521 for(i = 0, n = 0; i < 8; ++i)
522 if(xmm_mask & (1 << i))
523 ++n;
524
525 x86_sub_imm(
526 func,
527 x86_make_reg( file_REG32, reg_SP ),
528 n*16);
529
530 for(i = 0, n = 0; i < 8; ++i)
531 if(xmm_mask & (1 << i)) {
532 sse_movups(
533 func,
534 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
535 make_xmm( i ) );
536 ++n;
537 }
538
539 x86_lea(
540 func,
541 ecx,
542 get_temp( TEMP_R0, 0 ) );
543
544 x86_push( func, ecx );
545 x86_mov_reg_imm( func, ecx, (unsigned long) code );
546 x86_call( func, ecx );
547 x86_pop(func, ecx );
548
549 for(i = 0, n = 0; i < 8; ++i)
550 if(xmm_mask & (1 << i)) {
551 sse_movups(
552 func,
553 make_xmm( i ),
554 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
555 ++n;
556 }
557
558 x86_add_imm(
559 func,
560 x86_make_reg( file_REG32, reg_SP ),
561 n*16);
562
563 /* Restore GP registers in reverse order.
564 */
565 x86_pop(
566 func,
567 x86_make_reg( file_REG32, reg_DX) );
568 x86_pop(
569 func,
570 x86_make_reg( file_REG32, reg_CX) );
571 x86_pop(
572 func,
573 x86_make_reg( file_REG32, reg_AX) );
574
575 sse_movaps(
576 func,
577 make_xmm( xmm_dst ),
578 get_temp( TEMP_R0, 0 ) );
579 }
580
581 static void
582 emit_func_call_dst_src(
583 struct x86_function *func,
584 unsigned xmm_save,
585 unsigned xmm_dst,
586 unsigned xmm_src,
587 void (PIPE_CDECL *code)() )
588 {
589 sse_movaps(
590 func,
591 get_temp( TEMP_R0, 1 ),
592 make_xmm( xmm_src ) );
593
594 emit_func_call_dst(
595 func,
596 xmm_save,
597 xmm_dst,
598 code );
599 }
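
/* A sketch of the calling convention the two emitters above establish (our
 * reading of the code, not an official interface description): the
 * generated code spills xmm_dst to TEMP_R0[0..3] (and, for dst/src calls,
 * xmm_src to TEMP_R0[4..7]), loads a pointer to that storage into ECX,
 * pushes it as the single PIPE_CDECL argument and calls the helper, which
 * reads and overwrites store[0..3] in place -- see cos4f() and pow4f()
 * below.
 */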
600
601 /*
602 * Fast SSE2 implementation of special math functions.
603 */
604
605 #define POLY0(x, c0) _mm_set1_ps(c0)
606 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
607 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
608 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
609 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
610 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
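
/* These macros expand to a Horner-form evaluation, e.g.
 *
 *    POLY2(x, c0, c1, c2) == c0 + x*(c1 + x*c2)
 *
 * so a degree-n polynomial costs n mulps/addps pairs per vector.
 */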
611
612 #define EXP_POLY_DEGREE 3
613 #define LOG_POLY_DEGREE 5
614
615 /**
616 * See http://www.devmaster.net/forums/showthread.php?p=43580
617 */
618 static INLINE __m128
619 exp2f4(__m128 x)
620 {
621 __m128i ipart;
622 __m128 fpart, expipart, expfpart;
623
624 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
625 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
626
627 /* ipart = int(x - 0.5) */
628 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
629
630 /* fpart = x - ipart */
631 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
632
633 /* expipart = (float) (1 << ipart) */
634 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
635
636 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
637 #if EXP_POLY_DEGREE == 5
638 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
639 #elif EXP_POLY_DEGREE == 4
640 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
641 #elif EXP_POLY_DEGREE == 3
642 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
643 #elif EXP_POLY_DEGREE == 2
644 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
645 #else
646 #error
647 #endif
648
649 return _mm_mul_ps(expipart, expfpart);
650 }
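
#if 0
/* A rough scalar sketch of the decomposition used by exp2f4() above, for
 * illustration only (never compiled).  The helper name is ours, the SSE
 * rounding details are glossed over, and a 32-bit int with IEEE single
 * precision floats is assumed.
 */
static float
exp2f_scalar_sketch( float x )
{
   union { float f; int i; } pow2int;
   float ipart, fpart;

   if (x >  129.0f)     x =  129.0f;
   if (x < -126.99999f) x = -126.99999f;

   ipart = floorf( x );           /* integer part */
   fpart = x - ipart;             /* fractional part */

   /* build 2^ipart directly in the float exponent field */
   pow2int.i = ((int) ipart + 127) << 23;

   /* approximate 2^fpart with the same degree-3 polynomial as above */
   return pow2int.f * (9.9992520e-1f +
                       fpart * (6.9583356e-1f +
                                fpart * (2.2606716e-1f +
                                         fpart * 7.8024521e-2f)));
}
#endif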
651
652 /**
653 * See http://www.devmaster.net/forums/showthread.php?p=43580
654 */
655 static INLINE __m128
656 log2f4(__m128 x)
657 {
658 __m128i expmask = _mm_set1_epi32(0x7f800000);
659 __m128i mantmask = _mm_set1_epi32(0x007fffff);
660 __m128 one = _mm_set1_ps(1.0f);
661
662 __m128i i = _mm_castps_si128(x);
663
664 /* exp = (float) exponent(x) */
665 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
666
667 /* mant = (float) mantissa(x) */
668 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
669
670 __m128 logmant;
671
672 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
673 * These coefficients can be generated with
674 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
675 */
676 #if LOG_POLY_DEGREE == 6
677 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
678 #elif LOG_POLY_DEGREE == 5
679 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
680 #elif LOG_POLY_DEGREE == 4
681 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
682 #elif LOG_POLY_DEGREE == 3
683 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
684 #else
685 #error
686 #endif
687
688 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
689 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
690
691 return _mm_add_ps(logmant, exp);
692 }
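
/* Worked example: for x = 8.0f the exponent field yields exp = 3.0 and the
 * mantissa reduces to mant = 1.0, so logmant = poly(1.0) * (1.0 - 1.0) = 0
 * and the result is exactly 3.0 = log2(8).
 */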
693
694 static INLINE __m128
695 powf4(__m128 x, __m128 y)
696 {
697 return exp2f4(_mm_mul_ps(log2f4(x), y));
698 }
699
700
701 /**
702 * Low-level instruction translators.
703 */
704
705 static void
706 emit_abs(
707 struct x86_function *func,
708 unsigned xmm )
709 {
710 sse_andps(
711 func,
712 make_xmm( xmm ),
713 get_temp(
714 TGSI_EXEC_TEMP_7FFFFFFF_I,
715 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
716 }
717
718 static void
719 emit_add(
720 struct x86_function *func,
721 unsigned xmm_dst,
722 unsigned xmm_src )
723 {
724 sse_addps(
725 func,
726 make_xmm( xmm_dst ),
727 make_xmm( xmm_src ) );
728 }
729
730 static void PIPE_CDECL
731 cos4f(
732 float *store )
733 {
734 store[0] = cosf( store[0] );
735 store[1] = cosf( store[1] );
736 store[2] = cosf( store[2] );
737 store[3] = cosf( store[3] );
738 }
739
740 static void
741 emit_cos(
742 struct x86_function *func,
743 unsigned xmm_save,
744 unsigned xmm_dst )
745 {
746 emit_func_call_dst(
747 func,
748 xmm_save,
749 xmm_dst,
750 cos4f );
751 }
752
753 static void PIPE_CDECL
754 #if defined(PIPE_CC_GCC)
755 __attribute__((force_align_arg_pointer))
756 #endif
757 ex24f(
758 float *store )
759 {
760 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
761 }
762
763 static void
764 emit_ex2(
765 struct x86_function *func,
766 unsigned xmm_save,
767 unsigned xmm_dst )
768 {
769 emit_func_call_dst(
770 func,
771 xmm_save,
772 xmm_dst,
773 ex24f );
774 }
775
776 static void
777 emit_f2it(
778 struct x86_function *func,
779 unsigned xmm )
780 {
781 sse2_cvttps2dq(
782 func,
783 make_xmm( xmm ),
784 make_xmm( xmm ) );
785 }
786
787 static void PIPE_CDECL
788 flr4f(
789 float *store )
790 {
791 store[0] = floorf( store[0] );
792 store[1] = floorf( store[1] );
793 store[2] = floorf( store[2] );
794 store[3] = floorf( store[3] );
795 }
796
797 static void
798 emit_flr(
799 struct x86_function *func,
800 unsigned xmm_save,
801 unsigned xmm_dst )
802 {
803 emit_func_call_dst(
804 func,
805 xmm_save,
806 xmm_dst,
807 flr4f );
808 }
809
810 static void PIPE_CDECL
811 frc4f(
812 float *store )
813 {
814 store[0] -= floorf( store[0] );
815 store[1] -= floorf( store[1] );
816 store[2] -= floorf( store[2] );
817 store[3] -= floorf( store[3] );
818 }
819
820 static void
821 emit_frc(
822 struct x86_function *func,
823 unsigned xmm_save,
824 unsigned xmm_dst )
825 {
826 emit_func_call_dst(
827 func,
828 xmm_save,
829 xmm_dst,
830 frc4f );
831 }
832
833 static void PIPE_CDECL
834 #if defined(PIPE_CC_GCC)
835 __attribute__((force_align_arg_pointer))
836 #endif
837 lg24f(
838 float *store )
839 {
840 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
841 }
842
843 static void
844 emit_lg2(
845 struct x86_function *func,
846 unsigned xmm_save,
847 unsigned xmm_dst )
848 {
849 emit_func_call_dst(
850 func,
851 xmm_save,
852 xmm_dst,
853 lg24f );
854 }
855
856 static void
857 emit_MOV(
858 struct x86_function *func,
859 unsigned xmm_dst,
860 unsigned xmm_src )
861 {
862 sse_movups(
863 func,
864 make_xmm( xmm_dst ),
865 make_xmm( xmm_src ) );
866 }
867
868 static void
869 emit_mul (struct x86_function *func,
870 unsigned xmm_dst,
871 unsigned xmm_src)
872 {
873 sse_mulps(
874 func,
875 make_xmm( xmm_dst ),
876 make_xmm( xmm_src ) );
877 }
878
879 static void
880 emit_neg(
881 struct x86_function *func,
882 unsigned xmm )
883 {
884 sse_xorps(
885 func,
886 make_xmm( xmm ),
887 get_temp(
888 TGSI_EXEC_TEMP_80000000_I,
889 TGSI_EXEC_TEMP_80000000_C ) );
890 }
891
892 static void PIPE_CDECL
893 #if defined(PIPE_CC_GCC)
894 __attribute__((force_align_arg_pointer))
895 #endif
896 pow4f(
897 float *store )
898 {
899 #if 1
900 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
901 #else
902 store[0] = powf( store[0], store[4] );
903 store[1] = powf( store[1], store[5] );
904 store[2] = powf( store[2], store[6] );
905 store[3] = powf( store[3], store[7] );
906 #endif
907 }
908
909 static void
910 emit_pow(
911 struct x86_function *func,
912 unsigned xmm_save,
913 unsigned xmm_dst,
914 unsigned xmm_src )
915 {
916 emit_func_call_dst_src(
917 func,
918 xmm_save,
919 xmm_dst,
920 xmm_src,
921 pow4f );
922 }
923
924 static void
925 emit_rcp (
926 struct x86_function *func,
927 unsigned xmm_dst,
928 unsigned xmm_src )
929 {
930 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
931 * good enough. Need to either emit a proper divide or use the
932 * iterative technique described below in emit_rsqrt().
933 */
934 sse2_rcpps(
935 func,
936 make_xmm( xmm_dst ),
937 make_xmm( xmm_src ) );
938 }
939
940 static void
941 emit_rsqrt(
942 struct x86_function *func,
943 unsigned xmm_dst,
944 unsigned xmm_src )
945 {
946 #if HIGH_PRECISION
947 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
948 * implementations, it is possible to improve their precision at
949 * fairly low cost, using a Newton-Raphson step, as below:
950 *
951 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
952 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
953 *
954 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
955 */
956 {
957 struct x86_reg dst = make_xmm( xmm_dst );
958 struct x86_reg src = make_xmm( xmm_src );
959 struct x86_reg tmp0 = make_xmm( 2 );
960 struct x86_reg tmp1 = make_xmm( 3 );
961
962 assert( xmm_dst != xmm_src );
963 assert( xmm_dst != 2 && xmm_dst != 3 );
964 assert( xmm_src != 2 && xmm_src != 3 );
965
966 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
967 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
968 sse_rsqrtps( func, tmp1, src );
969 sse_mulps( func, src, tmp1 );
970 sse_mulps( func, dst, tmp1 );
971 sse_mulps( func, src, tmp1 );
972 sse_subps( func, tmp0, src );
973 sse_mulps( func, dst, tmp0 );
974 }
975 #else
976 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
977 * good enough.
978 */
979 sse_rsqrtps(
980 func,
981 make_xmm( xmm_dst ),
982 make_xmm( xmm_src ) );
983 #endif
984 }
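
#if 0
/* Scalar sketch of the Newton-Raphson refinement emitted above, for
 * illustration only (never compiled); rsqrt_est stands for the ~12-bit
 * rsqrtps() estimate and the name is ours:
 */
static float
rsqrt_refine_sketch( float a, float rsqrt_est )
{
   return 0.5f * rsqrt_est * (3.0f - (a * rsqrt_est) * rsqrt_est);
}
#endif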
985
986 static void
987 emit_setsign(
988 struct x86_function *func,
989 unsigned xmm )
990 {
991 sse_orps(
992 func,
993 make_xmm( xmm ),
994 get_temp(
995 TGSI_EXEC_TEMP_80000000_I,
996 TGSI_EXEC_TEMP_80000000_C ) );
997 }
998
999 static void PIPE_CDECL
1000 sin4f(
1001 float *store )
1002 {
1003 store[0] = sinf( store[0] );
1004 store[1] = sinf( store[1] );
1005 store[2] = sinf( store[2] );
1006 store[3] = sinf( store[3] );
1007 }
1008
1009 static void
1010 emit_sin (struct x86_function *func,
1011 unsigned xmm_save,
1012 unsigned xmm_dst)
1013 {
1014 emit_func_call_dst(
1015 func,
1016 xmm_save,
1017 xmm_dst,
1018 sin4f );
1019 }
1020
1021 static void
1022 emit_sub(
1023 struct x86_function *func,
1024 unsigned xmm_dst,
1025 unsigned xmm_src )
1026 {
1027 sse_subps(
1028 func,
1029 make_xmm( xmm_dst ),
1030 make_xmm( xmm_src ) );
1031 }
1032
1033 /**
1034 * Register fetch.
1035 */
1036
1037 static void
1038 emit_fetch(
1039 struct x86_function *func,
1040 unsigned xmm,
1041 const struct tgsi_full_src_register *reg,
1042 const unsigned chan_index )
1043 {
1044 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1045
1046 switch (swizzle) {
1047 case TGSI_EXTSWIZZLE_X:
1048 case TGSI_EXTSWIZZLE_Y:
1049 case TGSI_EXTSWIZZLE_Z:
1050 case TGSI_EXTSWIZZLE_W:
1051 switch (reg->SrcRegister.File) {
1052 case TGSI_FILE_CONSTANT:
1053 emit_const(
1054 func,
1055 xmm,
1056 reg->SrcRegister.Index,
1057 swizzle,
1058 reg->SrcRegister.Indirect,
1059 reg->SrcRegisterInd.File,
1060 reg->SrcRegisterInd.Index );
1061 break;
1062
1063 case TGSI_FILE_IMMEDIATE:
1064 emit_immediate(
1065 func,
1066 xmm,
1067 reg->SrcRegister.Index,
1068 swizzle );
1069 break;
1070
1071 case TGSI_FILE_INPUT:
1072 emit_inputf(
1073 func,
1074 xmm,
1075 reg->SrcRegister.Index,
1076 swizzle );
1077 break;
1078
1079 case TGSI_FILE_TEMPORARY:
1080 emit_tempf(
1081 func,
1082 xmm,
1083 reg->SrcRegister.Index,
1084 swizzle );
1085 break;
1086
1087 default:
1088 assert( 0 );
1089 }
1090 break;
1091
1092 case TGSI_EXTSWIZZLE_ZERO:
1093 emit_tempf(
1094 func,
1095 xmm,
1096 TGSI_EXEC_TEMP_00000000_I,
1097 TGSI_EXEC_TEMP_00000000_C );
1098 break;
1099
1100 case TGSI_EXTSWIZZLE_ONE:
1101 emit_tempf(
1102 func,
1103 xmm,
1104 TEMP_ONE_I,
1105 TEMP_ONE_C );
1106 break;
1107
1108 default:
1109 assert( 0 );
1110 }
1111
1112 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1113 case TGSI_UTIL_SIGN_CLEAR:
1114 emit_abs( func, xmm );
1115 break;
1116
1117 case TGSI_UTIL_SIGN_SET:
1118 emit_setsign( func, xmm );
1119 break;
1120
1121 case TGSI_UTIL_SIGN_TOGGLE:
1122 emit_neg( func, xmm );
1123 break;
1124
1125 case TGSI_UTIL_SIGN_KEEP:
1126 break;
1127 }
1128 }
1129
1130 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1131 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1132
1133 /**
1134 * Register store.
1135 */
1136
1137 static void
1138 emit_store(
1139 struct x86_function *func,
1140 unsigned xmm,
1141 const struct tgsi_full_dst_register *reg,
1142 const struct tgsi_full_instruction *inst,
1143 unsigned chan_index )
1144 {
1145 switch( reg->DstRegister.File ) {
1146 case TGSI_FILE_OUTPUT:
1147 emit_output(
1148 func,
1149 xmm,
1150 reg->DstRegister.Index,
1151 chan_index );
1152 break;
1153
1154 case TGSI_FILE_TEMPORARY:
1155 emit_temps(
1156 func,
1157 xmm,
1158 reg->DstRegister.Index,
1159 chan_index );
1160 break;
1161
1162 case TGSI_FILE_ADDRESS:
1163 emit_addrs(
1164 func,
1165 xmm,
1166 reg->DstRegister.Index,
1167 chan_index );
1168 break;
1169
1170 default:
1171 assert( 0 );
1172 }
1173
1174 switch( inst->Instruction.Saturate ) {
1175 case TGSI_SAT_NONE:
1176 break;
1177
1178 case TGSI_SAT_ZERO_ONE:
1179 /* assert( 0 ); */
1180 break;
1181
1182 case TGSI_SAT_MINUS_PLUS_ONE:
1183 assert( 0 );
1184 break;
1185 }
1186 }
1187
1188 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1189 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1190
1191 /**
1192 * High-level instruction translators.
1193 */
1194
1195 static void
1196 emit_kil(
1197 struct x86_function *func,
1198 const struct tgsi_full_src_register *reg )
1199 {
1200 unsigned uniquemask;
1201 unsigned registers[4];
1202 unsigned nextregister = 0;
1203 unsigned firstchan = ~0;
1204 unsigned chan_index;
1205
1206 /* This mask stores component bits that were already tested. Note that
1207 * we test whether the value is less than zero, so 1.0 and 0.0 need not be
1208 * tested. */
1209 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1210
1211 FOR_EACH_CHANNEL( chan_index ) {
1212 unsigned swizzle;
1213
1214 /* unswizzle channel */
1215 swizzle = tgsi_util_get_full_src_register_extswizzle(
1216 reg,
1217 chan_index );
1218
1219 /* check if the component has not been already tested */
1220 if( !(uniquemask & (1 << swizzle)) ) {
1221 uniquemask |= 1 << swizzle;
1222
1223 /* allocate register */
1224 registers[chan_index] = nextregister;
1225 emit_fetch(
1226 func,
1227 nextregister,
1228 reg,
1229 chan_index );
1230 nextregister++;
1231
1232 /* mark the first channel used */
1233 if( firstchan == ~0 ) {
1234 firstchan = chan_index;
1235 }
1236 }
1237 }
1238
1239 x86_push(
1240 func,
1241 x86_make_reg( file_REG32, reg_AX ) );
1242 x86_push(
1243 func,
1244 x86_make_reg( file_REG32, reg_DX ) );
1245
1246 FOR_EACH_CHANNEL( chan_index ) {
1247 if( uniquemask & (1 << chan_index) ) {
1248 sse_cmpps(
1249 func,
1250 make_xmm( registers[chan_index] ),
1251 get_temp(
1252 TGSI_EXEC_TEMP_00000000_I,
1253 TGSI_EXEC_TEMP_00000000_C ),
1254 cc_LessThan );
1255
1256 if( chan_index == firstchan ) {
1257 sse_pmovmskb(
1258 func,
1259 x86_make_reg( file_REG32, reg_AX ),
1260 make_xmm( registers[chan_index] ) );
1261 }
1262 else {
1263 sse_pmovmskb(
1264 func,
1265 x86_make_reg( file_REG32, reg_DX ),
1266 make_xmm( registers[chan_index] ) );
1267 x86_or(
1268 func,
1269 x86_make_reg( file_REG32, reg_AX ),
1270 x86_make_reg( file_REG32, reg_DX ) );
1271 }
1272 }
1273 }
1274
1275 x86_or(
1276 func,
1277 get_temp(
1278 TGSI_EXEC_TEMP_KILMASK_I,
1279 TGSI_EXEC_TEMP_KILMASK_C ),
1280 x86_make_reg( file_REG32, reg_AX ) );
1281
1282 x86_pop(
1283 func,
1284 x86_make_reg( file_REG32, reg_DX ) );
1285 x86_pop(
1286 func,
1287 x86_make_reg( file_REG32, reg_AX ) );
1288 }
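
/* The mask accumulated in AX above has four bits set for every quad element
 * whose tested component is negative (pmovmskb picks up the sign bit of each
 * byte of the comparison result); OR-ing it into TEMP_KILMASK marks those
 * pixels as killed.
 */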
1289
1290
1291 static void
1292 emit_kilp(
1293 struct x86_function *func )
1294 {
1295 /* XXX todo / fix me */
1296 }
1297
1298
1299 static void
1300 emit_setcc(
1301 struct x86_function *func,
1302 struct tgsi_full_instruction *inst,
1303 enum sse_cc cc )
1304 {
1305 unsigned chan_index;
1306
1307 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1308 FETCH( func, *inst, 0, 0, chan_index );
1309 FETCH( func, *inst, 1, 1, chan_index );
1310 sse_cmpps(
1311 func,
1312 make_xmm( 0 ),
1313 make_xmm( 1 ),
1314 cc );
1315 sse_andps(
1316 func,
1317 make_xmm( 0 ),
1318 get_temp(
1319 TEMP_ONE_I,
1320 TEMP_ONE_C ) );
1321 STORE( func, *inst, 0, 0, chan_index );
1322 }
1323 }
1324
1325 static void
1326 emit_cmp(
1327 struct x86_function *func,
1328 struct tgsi_full_instruction *inst )
1329 {
1330 unsigned chan_index;
1331
1332 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1333 FETCH( func, *inst, 0, 0, chan_index );
1334 FETCH( func, *inst, 1, 1, chan_index );
1335 FETCH( func, *inst, 2, 2, chan_index );
1336 sse_cmpps(
1337 func,
1338 make_xmm( 0 ),
1339 get_temp(
1340 TGSI_EXEC_TEMP_00000000_I,
1341 TGSI_EXEC_TEMP_00000000_C ),
1342 cc_LessThan );
1343 sse_andps(
1344 func,
1345 make_xmm( 1 ),
1346 make_xmm( 0 ) );
1347 sse_andnps(
1348 func,
1349 make_xmm( 0 ),
1350 make_xmm( 2 ) );
1351 sse_orps(
1352 func,
1353 make_xmm( 0 ),
1354 make_xmm( 1 ) );
1355 STORE( func, *inst, 0, 0, chan_index );
1356 }
1357 }
1358
1359 static int
1360 emit_instruction(
1361 struct x86_function *func,
1362 struct tgsi_full_instruction *inst )
1363 {
1364 unsigned chan_index;
1365
1366 switch (inst->Instruction.Opcode) {
1367 case TGSI_OPCODE_ARL:
1368 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1369 FETCH( func, *inst, 0, 0, chan_index );
1370 emit_f2it( func, 0 );
1371 STORE( func, *inst, 0, 0, chan_index );
1372 }
1373 break;
1374
1375 case TGSI_OPCODE_MOV:
1376 case TGSI_OPCODE_SWZ:
1377 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1378 FETCH( func, *inst, 0, 0, chan_index );
1379 STORE( func, *inst, 0, 0, chan_index );
1380 }
1381 break;
1382
1383 case TGSI_OPCODE_LIT:
1384 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1385 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1386 emit_tempf(
1387 func,
1388 0,
1389 TEMP_ONE_I,
1390 TEMP_ONE_C);
1391 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1392 STORE( func, *inst, 0, 0, CHAN_X );
1393 }
1394 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1395 STORE( func, *inst, 0, 0, CHAN_W );
1396 }
1397 }
1398 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1399 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1400 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1401 FETCH( func, *inst, 0, 0, CHAN_X );
1402 sse_maxps(
1403 func,
1404 make_xmm( 0 ),
1405 get_temp(
1406 TGSI_EXEC_TEMP_00000000_I,
1407 TGSI_EXEC_TEMP_00000000_C ) );
1408 STORE( func, *inst, 0, 0, CHAN_Y );
1409 }
1410 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1411 /* XMM[1] = SrcReg[0].yyyy */
1412 FETCH( func, *inst, 1, 0, CHAN_Y );
1413 /* XMM[1] = max(XMM[1], 0) */
1414 sse_maxps(
1415 func,
1416 make_xmm( 1 ),
1417 get_temp(
1418 TGSI_EXEC_TEMP_00000000_I,
1419 TGSI_EXEC_TEMP_00000000_C ) );
1420 /* XMM[2] = SrcReg[0].wwww */
1421 FETCH( func, *inst, 2, 0, CHAN_W );
1422 /* XMM[2] = min(XMM[2], 128.0) */
1423 sse_minps(
1424 func,
1425 make_xmm( 2 ),
1426 get_temp(
1427 TGSI_EXEC_TEMP_128_I,
1428 TGSI_EXEC_TEMP_128_C ) );
1429 /* XMM[2] = max(XMM[2], -128.0) */
1430 sse_maxps(
1431 func,
1432 make_xmm( 2 ),
1433 get_temp(
1434 TGSI_EXEC_TEMP_MINUS_128_I,
1435 TGSI_EXEC_TEMP_MINUS_128_C ) );
1436 emit_pow( func, 3, 1, 2 );
1437 FETCH( func, *inst, 0, 0, CHAN_X );
1438 sse_xorps(
1439 func,
1440 make_xmm( 2 ),
1441 make_xmm( 2 ) );
1442 sse_cmpps(
1443 func,
1444 make_xmm( 2 ),
1445 make_xmm( 0 ),
1446 cc_LessThanEqual );
1447 sse_andps(
1448 func,
1449 make_xmm( 2 ),
1450 make_xmm( 1 ) );
1451 STORE( func, *inst, 2, 0, CHAN_Z );
1452 }
1453 }
1454 break;
1455
1456 case TGSI_OPCODE_RCP:
1457 /* TGSI_OPCODE_RECIP */
1458 FETCH( func, *inst, 0, 0, CHAN_X );
1459 emit_rcp( func, 0, 0 );
1460 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1461 STORE( func, *inst, 0, 0, chan_index );
1462 }
1463 break;
1464
1465 case TGSI_OPCODE_RSQ:
1466 /* TGSI_OPCODE_RECIPSQRT */
1467 FETCH( func, *inst, 0, 0, CHAN_X );
1468 emit_rsqrt( func, 1, 0 );
1469 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1470 STORE( func, *inst, 1, 0, chan_index );
1471 }
1472 break;
1473
1474 case TGSI_OPCODE_EXP:
1475 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1476 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1477 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1478 FETCH( func, *inst, 0, 0, CHAN_X );
1479 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1480 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1481 emit_MOV( func, 1, 0 );
1482 emit_flr( func, 2, 1 );
1483 /* dst.x = ex2(floor(src.x)) */
1484 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1485 emit_MOV( func, 2, 1 );
1486 emit_ex2( func, 3, 2 );
1487 STORE( func, *inst, 2, 0, CHAN_X );
1488 }
1489 /* dst.y = src.x - floor(src.x) */
1490 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1491 emit_MOV( func, 2, 0 );
1492 emit_sub( func, 2, 1 );
1493 STORE( func, *inst, 2, 0, CHAN_Y );
1494 }
1495 }
1496 /* dst.z = ex2(src.x) */
1497 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1498 emit_ex2( func, 3, 0 );
1499 STORE( func, *inst, 0, 0, CHAN_Z );
1500 }
1501 }
1502 /* dst.w = 1.0 */
1503 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1504 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1505 STORE( func, *inst, 0, 0, CHAN_W );
1506 }
1507 break;
1508
1509 case TGSI_OPCODE_LOG:
1510 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1511 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1512 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1513 FETCH( func, *inst, 0, 0, CHAN_X );
1514 emit_abs( func, 0 );
1515 emit_MOV( func, 1, 0 );
1516 emit_lg2( func, 2, 1 );
1517 /* dst.z = lg2(abs(src.x)) */
1518 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1519 STORE( func, *inst, 1, 0, CHAN_Z );
1520 }
1521 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1522 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1523 emit_flr( func, 2, 1 );
1524 /* dst.x = floor(lg2(abs(src.x))) */
1525 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1526 STORE( func, *inst, 1, 0, CHAN_X );
1527 }
1528 /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1529 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1530 emit_ex2( func, 2, 1 );
1531 emit_rcp( func, 1, 1 );
1532 emit_mul( func, 0, 1 );
1533 STORE( func, *inst, 0, 0, CHAN_Y );
1534 }
1535 }
1536 }
1537 /* dst.w = 1.0 */
1538 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1539 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1540 STORE( func, *inst, 0, 0, CHAN_W );
1541 }
1542 break;
1543
1544 case TGSI_OPCODE_MUL:
1545 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1546 FETCH( func, *inst, 0, 0, chan_index );
1547 FETCH( func, *inst, 1, 1, chan_index );
1548 emit_mul( func, 0, 1 );
1549 STORE( func, *inst, 0, 0, chan_index );
1550 }
1551 break;
1552
1553 case TGSI_OPCODE_ADD:
1554 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1555 FETCH( func, *inst, 0, 0, chan_index );
1556 FETCH( func, *inst, 1, 1, chan_index );
1557 emit_add( func, 0, 1 );
1558 STORE( func, *inst, 0, 0, chan_index );
1559 }
1560 break;
1561
1562 case TGSI_OPCODE_DP3:
1563 /* TGSI_OPCODE_DOT3 */
1564 FETCH( func, *inst, 0, 0, CHAN_X );
1565 FETCH( func, *inst, 1, 1, CHAN_X );
1566 emit_mul( func, 0, 1 );
1567 FETCH( func, *inst, 1, 0, CHAN_Y );
1568 FETCH( func, *inst, 2, 1, CHAN_Y );
1569 emit_mul( func, 1, 2 );
1570 emit_add( func, 0, 1 );
1571 FETCH( func, *inst, 1, 0, CHAN_Z );
1572 FETCH( func, *inst, 2, 1, CHAN_Z );
1573 emit_mul( func, 1, 2 );
1574 emit_add( func, 0, 1 );
1575 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1576 STORE( func, *inst, 0, 0, chan_index );
1577 }
1578 break;
1579
1580 case TGSI_OPCODE_DP4:
1581 /* TGSI_OPCODE_DOT4 */
1582 FETCH( func, *inst, 0, 0, CHAN_X );
1583 FETCH( func, *inst, 1, 1, CHAN_X );
1584 emit_mul( func, 0, 1 );
1585 FETCH( func, *inst, 1, 0, CHAN_Y );
1586 FETCH( func, *inst, 2, 1, CHAN_Y );
1587 emit_mul( func, 1, 2 );
1588 emit_add( func, 0, 1 );
1589 FETCH( func, *inst, 1, 0, CHAN_Z );
1590 FETCH( func, *inst, 2, 1, CHAN_Z );
1591 emit_mul(func, 1, 2 );
1592 emit_add(func, 0, 1 );
1593 FETCH( func, *inst, 1, 0, CHAN_W );
1594 FETCH( func, *inst, 2, 1, CHAN_W );
1595 emit_mul( func, 1, 2 );
1596 emit_add( func, 0, 1 );
1597 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1598 STORE( func, *inst, 0, 0, chan_index );
1599 }
1600 break;
1601
1602 case TGSI_OPCODE_DST:
1603 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1604 emit_tempf(
1605 func,
1606 0,
1607 TEMP_ONE_I,
1608 TEMP_ONE_C );
1609 STORE( func, *inst, 0, 0, CHAN_X );
1610 }
1611 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1612 FETCH( func, *inst, 0, 0, CHAN_Y );
1613 FETCH( func, *inst, 1, 1, CHAN_Y );
1614 emit_mul( func, 0, 1 );
1615 STORE( func, *inst, 0, 0, CHAN_Y );
1616 }
1617 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1618 FETCH( func, *inst, 0, 0, CHAN_Z );
1619 STORE( func, *inst, 0, 0, CHAN_Z );
1620 }
1621 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1622 FETCH( func, *inst, 0, 1, CHAN_W );
1623 STORE( func, *inst, 0, 0, CHAN_W );
1624 }
1625 break;
1626
1627 case TGSI_OPCODE_MIN:
1628 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1629 FETCH( func, *inst, 0, 0, chan_index );
1630 FETCH( func, *inst, 1, 1, chan_index );
1631 sse_minps(
1632 func,
1633 make_xmm( 0 ),
1634 make_xmm( 1 ) );
1635 STORE( func, *inst, 0, 0, chan_index );
1636 }
1637 break;
1638
1639 case TGSI_OPCODE_MAX:
1640 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1641 FETCH( func, *inst, 0, 0, chan_index );
1642 FETCH( func, *inst, 1, 1, chan_index );
1643 sse_maxps(
1644 func,
1645 make_xmm( 0 ),
1646 make_xmm( 1 ) );
1647 STORE( func, *inst, 0, 0, chan_index );
1648 }
1649 break;
1650
1651 case TGSI_OPCODE_SLT:
1652 /* TGSI_OPCODE_SETLT */
1653 emit_setcc( func, inst, cc_LessThan );
1654 break;
1655
1656 case TGSI_OPCODE_SGE:
1657 /* TGSI_OPCODE_SETGE */
1658 emit_setcc( func, inst, cc_NotLessThan );
1659 break;
1660
1661 case TGSI_OPCODE_MAD:
1662 /* TGSI_OPCODE_MADD */
1663 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1664 FETCH( func, *inst, 0, 0, chan_index );
1665 FETCH( func, *inst, 1, 1, chan_index );
1666 FETCH( func, *inst, 2, 2, chan_index );
1667 emit_mul( func, 0, 1 );
1668 emit_add( func, 0, 2 );
1669 STORE( func, *inst, 0, 0, chan_index );
1670 }
1671 break;
1672
1673 case TGSI_OPCODE_SUB:
1674 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1675 FETCH( func, *inst, 0, 0, chan_index );
1676 FETCH( func, *inst, 1, 1, chan_index );
1677 emit_sub( func, 0, 1 );
1678 STORE( func, *inst, 0, 0, chan_index );
1679 }
1680 break;
1681
1682 case TGSI_OPCODE_LERP:
1683 /* TGSI_OPCODE_LRP */
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1685 FETCH( func, *inst, 0, 0, chan_index );
1686 FETCH( func, *inst, 1, 1, chan_index );
1687 FETCH( func, *inst, 2, 2, chan_index );
1688 emit_sub( func, 1, 2 );
1689 emit_mul( func, 0, 1 );
1690 emit_add( func, 0, 2 );
1691 STORE( func, *inst, 0, 0, chan_index );
1692 }
1693 break;
1694
1695 case TGSI_OPCODE_CND:
1696 return 0;
1697 break;
1698
1699 case TGSI_OPCODE_CND0:
1700 return 0;
1701 break;
1702
1703 case TGSI_OPCODE_DOT2ADD:
1704 /* TGSI_OPCODE_DP2A */
1705 return 0;
1706 break;
1707
1708 case TGSI_OPCODE_INDEX:
1709 return 0;
1710 break;
1711
1712 case TGSI_OPCODE_NEGATE:
1713 return 0;
1714 break;
1715
1716 case TGSI_OPCODE_FRAC:
1717 /* TGSI_OPCODE_FRC */
1718 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1719 FETCH( func, *inst, 0, 0, chan_index );
1720 emit_frc( func, 0, 0 );
1721 STORE( func, *inst, 0, 0, chan_index );
1722 }
1723 break;
1724
1725 case TGSI_OPCODE_CLAMP:
1726 return 0;
1727 break;
1728
1729 case TGSI_OPCODE_FLOOR:
1730 /* TGSI_OPCODE_FLR */
1731 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1732 FETCH( func, *inst, 0, 0, chan_index );
1733 emit_flr( func, 0, 0 );
1734 STORE( func, *inst, 0, 0, chan_index );
1735 }
1736 break;
1737
1738 case TGSI_OPCODE_ROUND:
1739 return 0;
1740 break;
1741
1742 case TGSI_OPCODE_EXPBASE2:
1743 /* TGSI_OPCODE_EX2 */
1744 FETCH( func, *inst, 0, 0, CHAN_X );
1745 emit_ex2( func, 0, 0 );
1746 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1747 STORE( func, *inst, 0, 0, chan_index );
1748 }
1749 break;
1750
1751 case TGSI_OPCODE_LOGBASE2:
1752 /* TGSI_OPCODE_LG2 */
1753 FETCH( func, *inst, 0, 0, CHAN_X );
1754 emit_lg2( func, 0, 0 );
1755 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1756 STORE( func, *inst, 0, 0, chan_index );
1757 }
1758 break;
1759
1760 case TGSI_OPCODE_POWER:
1761 /* TGSI_OPCODE_POW */
1762 FETCH( func, *inst, 0, 0, CHAN_X );
1763 FETCH( func, *inst, 1, 1, CHAN_X );
1764 emit_pow( func, 0, 0, 1 );
1765 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1766 STORE( func, *inst, 0, 0, chan_index );
1767 }
1768 break;
1769
1770 case TGSI_OPCODE_CROSSPRODUCT:
1771 /* TGSI_OPCODE_XPD */
1772 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1773 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1774 FETCH( func, *inst, 1, 1, CHAN_Z );
1775 FETCH( func, *inst, 3, 0, CHAN_Z );
1776 }
1777 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1778 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1779 FETCH( func, *inst, 0, 0, CHAN_Y );
1780 FETCH( func, *inst, 4, 1, CHAN_Y );
1781 }
1782 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1783 emit_MOV( func, 2, 0 );
1784 emit_mul( func, 2, 1 );
1785 emit_MOV( func, 5, 3 );
1786 emit_mul( func, 5, 4 );
1787 emit_sub( func, 2, 5 );
1788 STORE( func, *inst, 2, 0, CHAN_X );
1789 }
1790 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1791 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1792 FETCH( func, *inst, 2, 1, CHAN_X );
1793 FETCH( func, *inst, 5, 0, CHAN_X );
1794 }
1795 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1796 emit_mul( func, 3, 2 );
1797 emit_mul( func, 1, 5 );
1798 emit_sub( func, 3, 1 );
1799 STORE( func, *inst, 3, 0, CHAN_Y );
1800 }
1801 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1802 emit_mul( func, 5, 4 );
1803 emit_mul( func, 0, 2 );
1804 emit_sub( func, 5, 0 );
1805 STORE( func, *inst, 5, 0, CHAN_Z );
1806 }
1807 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1808 emit_tempf(
1809 func,
1810 0,
1811 TEMP_ONE_I,
1812 TEMP_ONE_C );
1813 STORE( func, *inst, 0, 0, CHAN_W );
1814 }
1815 break;
1816
1817 case TGSI_OPCODE_MULTIPLYMATRIX:
1818 return 0;
1819 break;
1820
1821 case TGSI_OPCODE_ABS:
1822 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1823 FETCH( func, *inst, 0, 0, chan_index );
1824 emit_abs( func, 0) ;
1825
1826 STORE( func, *inst, 0, 0, chan_index );
1827 }
1828 break;
1829
1830 case TGSI_OPCODE_RCC:
1831 return 0;
1832 break;
1833
1834 case TGSI_OPCODE_DPH:
1835 FETCH( func, *inst, 0, 0, CHAN_X );
1836 FETCH( func, *inst, 1, 1, CHAN_X );
1837 emit_mul( func, 0, 1 );
1838 FETCH( func, *inst, 1, 0, CHAN_Y );
1839 FETCH( func, *inst, 2, 1, CHAN_Y );
1840 emit_mul( func, 1, 2 );
1841 emit_add( func, 0, 1 );
1842 FETCH( func, *inst, 1, 0, CHAN_Z );
1843 FETCH( func, *inst, 2, 1, CHAN_Z );
1844 emit_mul( func, 1, 2 );
1845 emit_add( func, 0, 1 );
1846 FETCH( func, *inst, 1, 1, CHAN_W );
1847 emit_add( func, 0, 1 );
1848 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1849 STORE( func, *inst, 0, 0, chan_index );
1850 }
1851 break;
1852
1853 case TGSI_OPCODE_COS:
1854 FETCH( func, *inst, 0, 0, CHAN_X );
1855 emit_cos( func, 0, 0 );
1856 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1857 STORE( func, *inst, 0, 0, chan_index );
1858 }
1859 break;
1860
1861 case TGSI_OPCODE_DDX:
1862 return 0;
1863 break;
1864
1865 case TGSI_OPCODE_DDY:
1866 return 0;
1867 break;
1868
1869 case TGSI_OPCODE_KILP:
1870 /* predicated kill */
1871 emit_kilp( func );
1872 return 0; /* XXX fix me */
1873 break;
1874
1875 case TGSI_OPCODE_KIL:
1876 /* conditional kill */
1877 emit_kil( func, &inst->FullSrcRegisters[0] );
1878 break;
1879
1880 case TGSI_OPCODE_PK2H:
1881 return 0;
1882 break;
1883
1884 case TGSI_OPCODE_PK2US:
1885 return 0;
1886 break;
1887
1888 case TGSI_OPCODE_PK4B:
1889 return 0;
1890 break;
1891
1892 case TGSI_OPCODE_PK4UB:
1893 return 0;
1894 break;
1895
1896 case TGSI_OPCODE_RFL:
1897 return 0;
1898 break;
1899
1900 case TGSI_OPCODE_SEQ:
1901 return 0;
1902 break;
1903
1904 case TGSI_OPCODE_SFL:
1905 return 0;
1906 break;
1907
1908 case TGSI_OPCODE_SGT:
1909 return 0;
1910 break;
1911
1912 case TGSI_OPCODE_SIN:
1913 FETCH( func, *inst, 0, 0, CHAN_X );
1914 emit_sin( func, 0, 0 );
1915 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1916 STORE( func, *inst, 0, 0, chan_index );
1917 }
1918 break;
1919
1920 case TGSI_OPCODE_SLE:
1921 return 0;
1922 break;
1923
1924 case TGSI_OPCODE_SNE:
1925 return 0;
1926 break;
1927
1928 case TGSI_OPCODE_STR:
1929 return 0;
1930 break;
1931
1932 case TGSI_OPCODE_TEX:
1933 if (0) {
1934 /* Disable dummy texture code:
1935 */
1936 emit_tempf(
1937 func,
1938 0,
1939 TEMP_ONE_I,
1940 TEMP_ONE_C );
1941 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1942 STORE( func, *inst, 0, 0, chan_index );
1943 }
1944 }
1945 else {
1946 return 0;
1947 }
1948 break;
1949
1950 case TGSI_OPCODE_TXD:
1951 return 0;
1952 break;
1953
1954 case TGSI_OPCODE_UP2H:
1955 return 0;
1956 break;
1957
1958 case TGSI_OPCODE_UP2US:
1959 return 0;
1960 break;
1961
1962 case TGSI_OPCODE_UP4B:
1963 return 0;
1964 break;
1965
1966 case TGSI_OPCODE_UP4UB:
1967 return 0;
1968 break;
1969
1970 case TGSI_OPCODE_X2D:
1971 return 0;
1972 break;
1973
1974 case TGSI_OPCODE_ARA:
1975 return 0;
1976 break;
1977
1978 case TGSI_OPCODE_ARR:
1979 return 0;
1980 break;
1981
1982 case TGSI_OPCODE_BRA:
1983 return 0;
1984 break;
1985
1986 case TGSI_OPCODE_CAL:
1987 return 0;
1988 break;
1989
1990 case TGSI_OPCODE_RET:
1991 emit_ret( func );
1992 break;
1993
1994 case TGSI_OPCODE_END:
1995 break;
1996
1997 case TGSI_OPCODE_SSG:
1998 return 0;
1999 break;
2000
2001 case TGSI_OPCODE_CMP:
2002 emit_cmp (func, inst);
2003 break;
2004
2005 case TGSI_OPCODE_SCS:
2006 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2007 FETCH( func, *inst, 0, 0, CHAN_X );
2008 emit_cos( func, 0, 0 );
2009 STORE( func, *inst, 0, 0, CHAN_X );
2010 }
2011 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2012 FETCH( func, *inst, 0, 0, CHAN_X );
2013 emit_sin( func, 0, 0 );
2014 STORE( func, *inst, 0, 0, CHAN_Y );
2015 }
2016 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2017 emit_tempf(
2018 func,
2019 0,
2020 TGSI_EXEC_TEMP_00000000_I,
2021 TGSI_EXEC_TEMP_00000000_C );
2022 STORE( func, *inst, 0, 0, CHAN_Z );
2023 }
2024 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2025 emit_tempf(
2026 func,
2027 0,
2028 TEMP_ONE_I,
2029 TEMP_ONE_C );
2030 STORE( func, *inst, 0, 0, CHAN_W );
2031 }
2032 break;
2033
2034 case TGSI_OPCODE_TXB:
2035 return 0;
2036 break;
2037
2038 case TGSI_OPCODE_NRM:
2039 return 0;
2040 break;
2041
2042 case TGSI_OPCODE_DIV:
2043 return 0;
2044 break;
2045
2046 case TGSI_OPCODE_DP2:
2047 return 0;
2048 break;
2049
2050 case TGSI_OPCODE_TXL:
2051 return 0;
2052 break;
2053
2054 case TGSI_OPCODE_BRK:
2055 return 0;
2056 break;
2057
2058 case TGSI_OPCODE_IF:
2059 return 0;
2060 break;
2061
2062 case TGSI_OPCODE_LOOP:
2063 return 0;
2064 break;
2065
2066 case TGSI_OPCODE_REP:
2067 return 0;
2068 break;
2069
2070 case TGSI_OPCODE_ELSE:
2071 return 0;
2072 break;
2073
2074 case TGSI_OPCODE_ENDIF:
2075 return 0;
2076 break;
2077
2078 case TGSI_OPCODE_ENDLOOP:
2079 return 0;
2080 break;
2081
2082 case TGSI_OPCODE_ENDREP:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_PUSHA:
2087 return 0;
2088 break;
2089
2090 case TGSI_OPCODE_POPA:
2091 return 0;
2092 break;
2093
2094 case TGSI_OPCODE_CEIL:
2095 return 0;
2096 break;
2097
2098 case TGSI_OPCODE_I2F:
2099 return 0;
2100 break;
2101
2102 case TGSI_OPCODE_NOT:
2103 return 0;
2104 break;
2105
2106 case TGSI_OPCODE_TRUNC:
2107 return 0;
2108 break;
2109
2110 case TGSI_OPCODE_SHL:
2111 return 0;
2112 break;
2113
2114 case TGSI_OPCODE_SHR:
2115 return 0;
2116 break;
2117
2118 case TGSI_OPCODE_AND:
2119 return 0;
2120 break;
2121
2122 case TGSI_OPCODE_OR:
2123 return 0;
2124 break;
2125
2126 case TGSI_OPCODE_MOD:
2127 return 0;
2128 break;
2129
2130 case TGSI_OPCODE_XOR:
2131 return 0;
2132 break;
2133
2134 case TGSI_OPCODE_SAD:
2135 return 0;
2136 break;
2137
2138 case TGSI_OPCODE_TXF:
2139 return 0;
2140 break;
2141
2142 case TGSI_OPCODE_TXQ:
2143 return 0;
2144 break;
2145
2146 case TGSI_OPCODE_CONT:
2147 return 0;
2148 break;
2149
2150 case TGSI_OPCODE_EMIT:
2151 return 0;
2152 break;
2153
2154 case TGSI_OPCODE_ENDPRIM:
2155 return 0;
2156 break;
2157
2158 default:
2159 return 0;
2160 }
2161
2162 return 1;
2163 }
2164
2165 static void
2166 emit_declaration(
2167 struct x86_function *func,
2168 struct tgsi_full_declaration *decl )
2169 {
2170 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2171 unsigned first, last, mask;
2172 unsigned i, j;
2173
2174 first = decl->DeclarationRange.First;
2175 last = decl->DeclarationRange.Last;
2176 mask = decl->Declaration.UsageMask;
2177
2178 for( i = first; i <= last; i++ ) {
2179 for( j = 0; j < NUM_CHANNELS; j++ ) {
2180 if( mask & (1 << j) ) {
2181 switch( decl->Declaration.Interpolate ) {
2182 case TGSI_INTERPOLATE_CONSTANT:
2183 emit_coef_a0( func, 0, i, j );
2184 emit_inputs( func, 0, i, j );
2185 break;
2186
2187 case TGSI_INTERPOLATE_LINEAR:
2188 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2189 emit_coef_dadx( func, 1, i, j );
2190 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2191 emit_coef_dady( func, 3, i, j );
2192 emit_mul( func, 0, 1 ); /* x * dadx */
2193 emit_coef_a0( func, 4, i, j );
2194 emit_mul( func, 2, 3 ); /* y * dady */
2195 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2196 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2197 emit_inputs( func, 0, i, j );
2198 break;
2199
2200 case TGSI_INTERPOLATE_PERSPECTIVE:
2201 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2202 emit_coef_dadx( func, 1, i, j );
2203 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2204 emit_coef_dady( func, 3, i, j );
2205 emit_mul( func, 0, 1 ); /* x * dadx */
2206 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2207 emit_coef_a0( func, 5, i, j );
2208 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2209 emit_mul( func, 2, 3 ); /* y * dady */
2210 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2211 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2212 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2213 emit_inputs( func, 0, i, j );
2214 break;
2215
2216 default:
2217 assert( 0 );
2218 break;
2219 }
2220 }
2221 }
2222 }
2223 }
2224 }
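
/* In other words, for LINEAR interpolation each enabled input channel is
 * computed as
 *
 *    input = a0 + dadx * x + dady * y
 *
 * while PERSPECTIVE additionally multiplies that result by 1/w, the
 * reciprocal of the W value fetched alongside x and y from temp 0.
 */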
2225
2226 static void aos_to_soa( struct x86_function *func,
2227 uint arg_aos,
2228 uint arg_soa,
2229 uint arg_num,
2230 uint arg_stride )
2231 {
2232 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2233 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2234 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2235 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2236 int inner_loop;
2237
2238
2239 /* Save EBX */
2240 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2241
2242 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2243 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2244 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2245 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2246
2247 /* do */
2248 inner_loop = x86_get_label( func );
2249 {
2250 x86_push( func, aos_input );
2251 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2252 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2253 x86_add( func, aos_input, stride );
2254 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2255 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2256 x86_add( func, aos_input, stride );
2257 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2258 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2259 x86_add( func, aos_input, stride );
2260 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2261 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2262 x86_pop( func, aos_input );
2263
2264 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2265 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2266 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2267 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2268 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2269 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2270
2271 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2272 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2273 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2274 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2275
2276 /* Advance to next input */
2277 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2278 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2279 }
2280 /* while --num_inputs */
2281 x86_dec( func, num_inputs );
2282 x86_jcc( func, cc_NE, inner_loop );
2283
2284 /* Restore EBX */
2285 x86_pop( func, aos_input );
2286 }
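
/* The shuffle sequence above is a 4x4 transpose.  For each input attribute,
 * given four vertices v0..v3 laid out AoS as (x,y,z,w), the stores produce
 * the SoA layout expected by the shader machine (byte offsets):
 *
 *    soa[ 0..15] = x0 x1 x2 x3
 *    soa[16..31] = y0 y1 y2 y3
 *    soa[32..47] = z0 z1 z2 z3
 *    soa[48..63] = w0 w1 w2 w3
 */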
2287
2288 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2289 {
2290 struct x86_reg soa_output;
2291 struct x86_reg aos_output;
2292 struct x86_reg num_outputs;
2293 struct x86_reg temp;
2294 int inner_loop;
2295
2296 soa_output = x86_make_reg( file_REG32, reg_AX );
2297 aos_output = x86_make_reg( file_REG32, reg_BX );
2298 num_outputs = x86_make_reg( file_REG32, reg_CX );
2299 temp = x86_make_reg( file_REG32, reg_DX );
2300
2301 /* Save EBX */
2302 x86_push( func, aos_output );
2303
2304 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2305 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2306 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2307
2308 /* do */
2309 inner_loop = x86_get_label( func );
2310 {
2311 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2312 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2313 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2314 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2315
2316 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2317 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2318 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2319 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2320 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2321 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2322
2323 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2324 x86_push( func, aos_output );
2325 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2326 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2327 x86_add( func, aos_output, temp );
2328 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2329 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2330 x86_add( func, aos_output, temp );
2331 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2332 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2333 x86_add( func, aos_output, temp );
2334 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2335 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2336 x86_pop( func, aos_output );
2337
2338 /* Advance to next output */
2339 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2340 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2341 }
2342 /* while --num_outputs */
2343 x86_dec( func, num_outputs );
2344 x86_jcc( func, cc_NE, inner_loop );
2345
2346 /* Restore EBX */
2347 x86_pop( func, aos_output );
2348 }
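
/* This is the inverse of aos_to_soa(): the unpack sequence transposes the
 * xxxx/yyyy/zzzz/wwww vectors back into per-vertex (x,y,z,w) form, and the
 * movlps/movhps stores write each vertex out at the caller-supplied stride.
 */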
2349
2350 /**
2351 * Translate a TGSI vertex/fragment shader to SSE2 code.
2352 * Slightly different things are done for vertex vs. fragment shaders.
2353 *
2354 * Note that fragment shaders are responsible for interpolating shader
2355 * inputs. Because on x86 only 4 GP registers are available here and we
2356 * have 5 shader arguments (input, output, const, temp and coef), the
2357 * code is split into two phases -- a DECLARATION and an INSTRUCTION phase.
2358 * The GP register holding the output argument is aliased with the coef
2359 * argument, as outputs are not needed in the DECLARATION phase.
2360 *
2361 * \param tokens the TGSI input shader
2362 * \param func the output SSE code/function
2363 * \param immediates buffer to place immediates, later passed to SSE func
2364 * \return 1 for success, 0 if translation failed
2365 */
2366 unsigned
2367 tgsi_emit_sse2(
2368 const struct tgsi_token *tokens,
2369 struct x86_function *func,
2370 float (*immediates)[4],
2371 boolean do_swizzles )
2372 {
2373 struct tgsi_parse_context parse;
2374 boolean instruction_phase = FALSE;
2375 unsigned ok = 1;
2376 uint num_immediates = 0;
2377
2378 util_init_math();
2379
2380 func->csr = func->store;
2381
2382 tgsi_parse_init( &parse, tokens );
2383
2384 /* Can't just use EDI, EBX without save/restoring them:
2385 */
2386 x86_push(
2387 func,
2388 get_immediate_base() );
2389
2390 x86_push(
2391 func,
2392 get_temp_base() );
2393
2394
2395 /*
2396 * Different function args for vertex/fragment shaders:
2397 */
2398 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2399 /* DECLARATION phase, do not load output argument. */
2400 x86_mov(
2401 func,
2402 get_input_base(),
2403 x86_fn_arg( func, 1 ) );
2404 /* skipping outputs argument here */
2405 x86_mov(
2406 func,
2407 get_const_base(),
2408 x86_fn_arg( func, 3 ) );
2409 x86_mov(
2410 func,
2411 get_temp_base(),
2412 x86_fn_arg( func, 4 ) );
2413 x86_mov(
2414 func,
2415 get_coef_base(),
2416 x86_fn_arg( func, 5 ) );
2417 x86_mov(
2418 func,
2419 get_immediate_base(),
2420 x86_fn_arg( func, 6 ) );
2421 }
2422 else {
2423 assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
2424
2425 if (do_swizzles)
2426 aos_to_soa( func,
2427 6, /* aos_input */
2428 1, /* machine->input */
2429 7, /* num_inputs */
2430 8 ); /* input_stride */
2431
2432 x86_mov(
2433 func,
2434 get_input_base(),
2435 x86_fn_arg( func, 1 ) );
2436 x86_mov(
2437 func,
2438 get_output_base(),
2439 x86_fn_arg( func, 2 ) );
2440 x86_mov(
2441 func,
2442 get_const_base(),
2443 x86_fn_arg( func, 3 ) );
2444 x86_mov(
2445 func,
2446 get_temp_base(),
2447 x86_fn_arg( func, 4 ) );
2448 x86_mov(
2449 func,
2450 get_immediate_base(),
2451 x86_fn_arg( func, 5 ) );
2452 }
2453
2454 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2455 tgsi_parse_token( &parse );
2456
2457 switch( parse.FullToken.Token.Type ) {
2458 case TGSI_TOKEN_TYPE_DECLARATION:
2459 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2460 emit_declaration(
2461 func,
2462 &parse.FullToken.FullDeclaration );
2463 }
2464 break;
2465
2466 case TGSI_TOKEN_TYPE_INSTRUCTION:
2467 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2468 if( !instruction_phase ) {
2469 /* INSTRUCTION phase, overwrite coeff with output. */
2470 instruction_phase = TRUE;
2471 x86_mov(
2472 func,
2473 get_output_base(),
2474 x86_fn_arg( func, 2 ) );
2475 }
2476 }
2477
2478 ok = emit_instruction(
2479 func,
2480 &parse.FullToken.FullInstruction );
2481
2482 if (!ok) {
2483 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2484 parse.FullToken.FullInstruction.Instruction.Opcode,
2485 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2486 "vertex shader" : "fragment shader");
2487 }
2488 break;
2489
2490 case TGSI_TOKEN_TYPE_IMMEDIATE:
2491 /* simply copy the immediate values into the next immediates[] slot */
2492 {
2493 const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
2494 uint i;
2495 assert(size <= 4);
2496 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2497 for( i = 0; i < size; i++ ) {
2498 immediates[num_immediates][i] =
2499 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2500 }
2501 #if 0
2502 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2503 num_immediates,
2504 immediates[num_immediates][0],
2505 immediates[num_immediates][1],
2506 immediates[num_immediates][2],
2507 immediates[num_immediates][3]);
2508 #endif
2509 num_immediates++;
2510 }
2511 break;
2512
2513 default:
2514 ok = 0;
2515 assert( 0 );
2516 }
2517 }
2518
2519 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2520 if (do_swizzles)
2521 soa_to_aos( func, 9, 2, 10, 11 );
2522 }
2523
2524 /* Can't just use EBX, EDI without save/restoring them:
2525 */
2526 x86_pop(
2527 func,
2528 get_temp_base() );
2529
2530 x86_pop(
2531 func,
2532 get_immediate_base() );
2533
2534 emit_ret( func );
2535
2536 tgsi_parse_free( &parse );
2537
2538 return ok;
2539 }
2540
2541 #endif /* PIPE_ARCH_X86 && PIPE_ARCH_SSE */
2542