3ce2c1c27bdf9885ff1f4c2d75ae2f42be710e51
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_sse.h"
36 #include "tgsi/tgsi_parse.h"
37 #include "tgsi/tgsi_util.h"
38 #include "tgsi_exec.h"
39 #include "tgsi_sse2.h"
40
41 #include "rtasm/rtasm_x86sse.h"
42
43 /* for 1/sqrt()
44 *
45 * This costs about 100fps (close to 10%) in gears:
46 */
47 #define HIGH_PRECISION 1
48
49 #define FAST_MATH 1
50
51
/* Iterate CHAN over all four SOA channels (x, y, z, w). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero if channel CHAN is set in dst register 0's writemask. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only the channels enabled in dst register 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3

/* Shorthands for well-known slots in the tgsi_exec temporary file. */
#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
78
79 /**
80 * X86 utility functions.
81 */
82
/** Wrap an xmm register index as an x86_reg operand. */
static struct x86_reg
make_xmm(
   unsigned xmm )
{
   return x86_make_reg(
      file_XMM,
      (enum x86_reg_name) xmm );
}

/**
 * X86 register mapping helpers.
 *
 * The generated code addresses each register file through a fixed GP
 * register (set up by the caller of the generated function).
 */

/** ECX points at the constant buffer. */
static struct x86_reg
get_const_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_CX );
}

/** EAX points at the shader inputs. */
static struct x86_reg
get_input_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_AX );
}

/** EDX points at the shader outputs. */
static struct x86_reg
get_output_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DX );
}

/** EBX points at the temporary registers. */
static struct x86_reg
get_temp_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_BX );
}

/** Interpolation coefficients share the output base register (EDX). */
static struct x86_reg
get_coef_base( void )
{
   return get_output_base();
}

/** EDI points at the immediates. */
static struct x86_reg
get_immediate_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DI );
}
141
142
143 /**
144 * Data access helpers.
145 */
146
147
/** Address of immediate [vec] channel [chan] -- one float (4 bytes) per channel. */
static struct x86_reg
get_immediate(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_immediate_base(),
      (vec * 4 + chan) * 4 );
}

/** Address of constant [vec] channel [chan] -- one float (4 bytes) per channel. */
static struct x86_reg
get_const(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_const_base(),
      (vec * 4 + chan) * 4 );
}

/** Address of input [vec] channel [chan] -- SOA quad: 16 bytes per channel. */
static struct x86_reg
get_input(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_input_base(),
      (vec * 4 + chan) * 16 );
}

/** Address of output [vec] channel [chan] -- SOA quad: 16 bytes per channel. */
static struct x86_reg
get_output(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_output_base(),
      (vec * 4 + chan) * 16 );
}

/** Address of temporary [vec] channel [chan] -- SOA quad: 16 bytes per channel. */
static struct x86_reg
get_temp(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_temp_base(),
      (vec * 4 + chan) * 16 );
}

/** Address of coefficient: member 0=a0, 1=dadx, 2=dady; one float per channel. */
static struct x86_reg
get_coef(
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   return x86_make_disp(
      get_coef_base(),
      ((vec * 3 + member) * 4 + chan) * 4 );
}


/** Emit a function return instruction. */
static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}
216
217
218 /**
219 * Data fetch helpers.
220 */
221
/**
 * Copy a shader constant to xmm register
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero for indirect addressing, i.e. CONST[ADDR+vec]
 * \param indirectFile register file of the indirect index (must be ADDRESS)
 * \param indirectIndex register index of the indirect index (must be 0)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_input_base();
      struct x86_reg r1 = get_output_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );

      /* Borrow the input/output base registers as scratch; restored below. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* The four gathered floats now sit in TEMP_R0.x; load them. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar, then broadcast it across all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
313
/**
 * Copy a shader immediate to an xmm register, broadcast to all four lanes.
 * \param xmm the destination xmm register
 * \param vec the src immediate index
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_immediate(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movss(
      func,
      make_xmm( xmm ),
      get_immediate( vec, chan ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}


/**
 * Copy a shader input to xmm register
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movups: unaligned load of a full SOA quad (4 floats). */
   sse_movups(
      func,
      make_xmm( xmm ),
      get_input( vec, chan ) );
}

/**
 * Store an xmm register to a shader output
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan src dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movups: unaligned store of a full SOA quad (4 floats). */
   sse_movups(
      func,
      get_output( vec, chan ),
      make_xmm( xmm ) );
}

/**
 * Copy a shader temporary to xmm register
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movaps: the temporary file is 16-byte aligned. */
   sse_movaps(
      func,
      make_xmm( xmm ),
      get_temp( vec, chan ) );
}

/**
 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 * \param xmm the destination xmm register
 * \param vec the src input/attribute coefficient index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param member 0=a0, 1=dadx, 2=dady
 */
static void
emit_coef(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   /* Load the scalar coefficient and broadcast it across the quad. */
   sse_movss(
      func,
      make_xmm( xmm ),
      get_coef( vec, chan, member ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}
415
416 /**
417 * Data store helpers.
418 */
419
/** Store an xmm register to shader input [vec].chan (unaligned store). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups(
      func,
      get_input( vec, chan ),
      make_xmm( xmm ) );
}

/** Store an xmm register to shader temporary [vec].chan (aligned store). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps(
      func,
      get_temp( vec, chan ),
      make_xmm( xmm ) );
}

/** Store an xmm register to the address register (kept in the temp file). */
static void
emit_addrs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Only one address register is supported. */
   assert( vec == 0 );

   emit_temps(
      func,
      xmm,
      vec + TGSI_EXEC_TEMP_ADDR,
      chan );
}

/**
 * Coefficient fetch helpers.
 */

/** Load xmm with the a0 (constant) coefficient of input [vec].chan. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      0 );
}

/** Load xmm with the dadx (x-derivative) coefficient of input [vec].chan. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      1 );
}

/** Load xmm with the dady (y-derivative) coefficient of input [vec].chan. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      2 );
}
510
511 /**
512 * Function call helpers.
513 */
514
515 /**
516 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
517 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
518 * that the stack pointer is 16 byte aligned, as expected.
519 */
520 static void
521 emit_func_call_dst(
522 struct x86_function *func,
523 unsigned xmm_save,
524 unsigned xmm_dst,
525 void (PIPE_CDECL *code)() )
526 {
527 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
528 unsigned i, n, xmm;
529 unsigned xmm_mask;
530
531 /* Bitmask of the xmm registers to save */
532 xmm_mask = (1 << xmm_save) - 1;
533 xmm_mask &= ~(1 << xmm_dst);
534
535 sse_movaps(
536 func,
537 get_temp( TEMP_R0, 0 ),
538 make_xmm( xmm_dst ) );
539
540 x86_push(
541 func,
542 x86_make_reg( file_REG32, reg_AX) );
543 x86_push(
544 func,
545 x86_make_reg( file_REG32, reg_CX) );
546 x86_push(
547 func,
548 x86_make_reg( file_REG32, reg_DX) );
549
550 for(i = 0, n = 0; i < 8; ++i)
551 if(xmm_mask & (1 << i))
552 ++n;
553
554 x86_sub_imm(
555 func,
556 x86_make_reg( file_REG32, reg_SP ),
557 n*16);
558
559 for(i = 0, n = 0; i < 8; ++i)
560 if(xmm_mask & (1 << i)) {
561 sse_movups(
562 func,
563 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
564 make_xmm( xmm ) );
565 ++n;
566 }
567
568 x86_lea(
569 func,
570 ecx,
571 get_temp( TEMP_R0, 0 ) );
572
573 x86_push( func, ecx );
574 x86_mov_reg_imm( func, ecx, (unsigned long) code );
575 x86_call( func, ecx );
576 x86_pop(func, ecx );
577
578 for(i = 0, n = 0; i < 8; ++i)
579 if(xmm_mask & (1 << i)) {
580 sse_movups(
581 func,
582 make_xmm( xmm ),
583 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
584 ++n;
585 }
586
587 x86_add_imm(
588 func,
589 x86_make_reg( file_REG32, reg_SP ),
590 n*16);
591
592 /* Restore GP registers in a reverse order.
593 */
594 x86_pop(
595 func,
596 x86_make_reg( file_REG32, reg_DX) );
597 x86_pop(
598 func,
599 x86_make_reg( file_REG32, reg_CX) );
600 x86_pop(
601 func,
602 x86_make_reg( file_REG32, reg_AX) );
603
604 sse_movaps(
605 func,
606 make_xmm( xmm_dst ),
607 get_temp( TEMP_R0, 0 ) );
608 }
609
/**
 * Like emit_func_call_dst(), but for two-operand helpers: the second
 * operand is spilled to TEMP_R0 channel 1 first, where the helper
 * finds it at store[4..7] (cf. pow4f below).
 */
static void
emit_func_call_dst_src(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src,
   void (PIPE_CDECL *code)() )
{
   sse_movaps(
      func,
      get_temp( TEMP_R0, 1 ),
      make_xmm( xmm_src ) );

   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      code );
}
629
/*
 * Fast SSE2 implementation of special math functions.
 */

/* POLYn(x, c0..cn): evaluate a degree-n polynomial in packed float x
 * via Horner's scheme, broadcasting each scalar coefficient.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Polynomial degrees for the exp2/log2 approximations below:
 * higher degree = better accuracy at the cost of extra multiplies.
 */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
643
/**
 * Approximate 2^x for four packed floats.
 *
 * Splits x into integer and fractional parts; the integer part is handled
 * exactly by constructing the float exponent bits, the fractional part by
 * a minimax polynomial of degree EXP_POLY_DEGREE.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp x so the biased exponent constructed below stays in range. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
680
/**
 * Approximate log2(x) for four packed floats.
 *
 * Extracts the float exponent directly from the IEEE-754 bits and fits the
 * mantissa in [1, 2[ with a minimax polynomial of degree LOG_POLY_DEGREE.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}
722
/** Approximate x^y for packed floats via x^y = 2^(y * log2(x)). */
static INLINE __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}
728
729
730 /**
731 * Low-level instruction translators.
732 */
733
/** Emit |xmm|: clear the sign bit by ANDing with 0x7fffffff. */
static void
emit_abs(
   struct x86_function *func,
   unsigned xmm )
{
   sse_andps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_7FFFFFFF_I,
         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
}

/** Emit xmm_dst += xmm_src (packed float add). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}
758
759 static void PIPE_CDECL
760 cos4f(
761 float *store )
762 {
763 store[0] = cosf( store[0] );
764 store[1] = cosf( store[1] );
765 store[2] = cosf( store[2] );
766 store[3] = cosf( store[3] );
767 }
768
/** Emit a call to cos4f() on xmm_dst. */
static void
emit_cos(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      cos4f );
}

/** Helper called from generated code: 2^x of each float in the quad, in place. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
}

/** Emit a call to ex24f() on xmm_dst. */
static void
emit_ex2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      ex24f );
}
804
/** Emit float-to-int conversion of xmm (truncate toward zero). */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

/** Emit int-to-float conversion of xmm. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

/** Helper called from generated code: floor of each float in the quad, in place. */
static void PIPE_CDECL
flr4f(
   float *store )
{
   store[0] = floorf( store[0] );
   store[1] = floorf( store[1] );
   store[2] = floorf( store[2] );
   store[3] = floorf( store[3] );
}

/** Emit a call to flr4f() on xmm_dst. */
static void
emit_flr(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      flr4f );
}
849
850 static void PIPE_CDECL
851 frc4f(
852 float *store )
853 {
854 store[0] -= floorf( store[0] );
855 store[1] -= floorf( store[1] );
856 store[2] -= floorf( store[2] );
857 store[3] -= floorf( store[3] );
858 }
859
/** Emit a call to frc4f() on xmm_dst. */
static void
emit_frc(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      frc4f );
}

/** Helper called from generated code: log2 of each float in the quad, in place. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
}

/** Emit a call to lg24f() on xmm_dst. */
static void
emit_lg2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      lg24f );
}
895
/** Emit a register-to-register move: xmm_dst = xmm_src. */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

/** Emit xmm_dst *= xmm_src (packed float multiply). */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

/** Emit -xmm: flip the sign bit by XORing with 0x80000000. */
static void
emit_neg(
   struct x86_function *func,
   unsigned xmm )
{
   sse_xorps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}
931
/** Helper called from generated code: store[0..3] = store[0..3] ^ store[4..7]. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if 1
   /* SSE approximation -- see powf4()/exp2f4()/log2f4() above. */
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   /* Exact scalar fallback. */
   store[0] = powf( store[0], store[4] );
   store[1] = powf( store[1], store[5] );
   store[2] = powf( store[2], store[6] );
   store[3] = powf( store[3], store[7] );
#endif
}

/** Emit a call to pow4f(): xmm_dst = xmm_dst ^ xmm_src. */
static void
emit_pow(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   emit_func_call_dst_src(
      func,
      xmm_save,
      xmm_dst,
      xmm_src,
      pow4f );
}
963
/** Emit xmm_dst = approximate 1/xmm_src. */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

/** Emit xmm_dst = 1/sqrt(xmm_src).  Clobbers xmm_src and xmm 2/3. */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* xmm 2 and 3 are used as scratch; callers must keep them free. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, tmp1, src  );
      sse_mulps(   func, src,  tmp1 );
      sse_mulps(   func, dst,  tmp1 );
      sse_mulps(   func, src,  tmp1 );
      sse_subps(   func, tmp0, src  );
      sse_mulps(   func, dst,  tmp0 );
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1025
/** Emit -|xmm|: force the sign bit on by ORing with 0x80000000. */
static void
emit_setsign(
   struct x86_function *func,
   unsigned xmm )
{
   sse_orps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

/** Helper called from generated code: sine of each float in the quad, in place. */
static void PIPE_CDECL
sin4f(
   float *store )
{
   store[0] = sinf( store[0] );
   store[1] = sinf( store[1] );
   store[2] = sinf( store[2] );
   store[3] = sinf( store[3] );
}
1048
/** Emit a call to sin4f() on xmm_dst. */
static void
emit_sin (struct x86_function *func,
          unsigned xmm_save,
          unsigned xmm_dst)
{
   emit_func_call_dst(
      func,
      xmm_save,
      xmm_dst,
      sin4f );
}

/** Emit xmm_dst -= xmm_src (packed float subtract). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}
1072
1073 /**
1074 * Register fetch.
1075 */
1076
/**
 * Emit code to load src register channel chan_index into xmm register xmm,
 * applying the register's extended swizzle (including the ZERO/ONE
 * pseudo-channels) and its sign mode (abs / negate / set-sign).
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* A real channel: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Pseudo-channel: load the constant 0.0 quad. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Pseudo-channel: load the constant 1.0 quad. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the register's sign mode to the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}

/* Fetch src operand INDEX, channel CHAN of instruction INST into xmm XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1172
1173 /**
1174 * Register store.
1175 */
1176
/**
 * Emit code to store xmm register xmm into dst register channel chan_index.
 * NOTE(review): saturation is not implemented -- TGSI_SAT_ZERO_ONE is
 * silently accepted (see the commented-out assert) and
 * TGSI_SAT_MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}

/* Store xmm XMM to dst operand INDEX, channel CHAN of instruction INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1230
1231 /**
1232 * High-level instruction translators.
1233 */
1234
1235 static void
1236 emit_kil(
1237 struct x86_function *func,
1238 const struct tgsi_full_src_register *reg )
1239 {
1240 unsigned uniquemask;
1241 unsigned registers[4];
1242 unsigned nextregister = 0;
1243 unsigned firstchan = ~0;
1244 unsigned chan_index;
1245
1246 /* This mask stores component bits that were already tested. Note that
1247 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1248 * tested. */
1249 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1250
1251 FOR_EACH_CHANNEL( chan_index ) {
1252 unsigned swizzle;
1253
1254 /* unswizzle channel */
1255 swizzle = tgsi_util_get_full_src_register_extswizzle(
1256 reg,
1257 chan_index );
1258
1259 /* check if the component has not been already tested */
1260 if( !(uniquemask & (1 << swizzle)) ) {
1261 uniquemask |= 1 << swizzle;
1262
1263 /* allocate register */
1264 registers[chan_index] = nextregister;
1265 emit_fetch(
1266 func,
1267 nextregister,
1268 reg,
1269 chan_index );
1270 nextregister++;
1271
1272 /* mark the first channel used */
1273 if( firstchan == ~0 ) {
1274 firstchan = chan_index;
1275 }
1276 }
1277 }
1278
1279 x86_push(
1280 func,
1281 x86_make_reg( file_REG32, reg_AX ) );
1282 x86_push(
1283 func,
1284 x86_make_reg( file_REG32, reg_DX ) );
1285
1286 FOR_EACH_CHANNEL( chan_index ) {
1287 if( uniquemask & (1 << chan_index) ) {
1288 sse_cmpps(
1289 func,
1290 make_xmm( registers[chan_index] ),
1291 get_temp(
1292 TGSI_EXEC_TEMP_00000000_I,
1293 TGSI_EXEC_TEMP_00000000_C ),
1294 cc_LessThan );
1295
1296 if( chan_index == firstchan ) {
1297 sse_pmovmskb(
1298 func,
1299 x86_make_reg( file_REG32, reg_AX ),
1300 make_xmm( registers[chan_index] ) );
1301 }
1302 else {
1303 sse_pmovmskb(
1304 func,
1305 x86_make_reg( file_REG32, reg_DX ),
1306 make_xmm( registers[chan_index] ) );
1307 x86_or(
1308 func,
1309 x86_make_reg( file_REG32, reg_AX ),
1310 x86_make_reg( file_REG32, reg_DX ) );
1311 }
1312 }
1313 }
1314
1315 x86_or(
1316 func,
1317 get_temp(
1318 TGSI_EXEC_TEMP_KILMASK_I,
1319 TGSI_EXEC_TEMP_KILMASK_C ),
1320 x86_make_reg( file_REG32, reg_AX ) );
1321
1322 x86_pop(
1323 func,
1324 x86_make_reg( file_REG32, reg_DX ) );
1325 x86_pop(
1326 func,
1327 x86_make_reg( file_REG32, reg_AX ) );
1328 }
1329
1330
1331 static void
1332 emit_kilp(
1333 struct x86_function *func )
1334 {
1335 /* XXX todo / fix me */
1336 }
1337
1338
/**
 * Emit code for the SLT/SGE/SEQ/... family: per enabled channel,
 * dst = (src0 <cc> src1) ? 1.0 : 0.0.
 *
 * cmpps yields all-ones/all-zeros lanes; ANDing with 1.0 converts
 * that mask to the canonical 1.0/0.0 result.
 */
static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}

/**
 * Emit code for the CMP instruction: per enabled channel,
 * dst = (src0 < 0) ? src1 : src2, implemented with a cmpps mask
 * and an and/andn/or select.
 */
static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      /* xmm0 = mask of lanes where src0 < 0 */
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      /* xmm1 = src1 where mask set */
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      /* xmm0 = src2 where mask clear */
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      /* combine the two halves of the select */
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1398
1399 static int
1400 emit_instruction(
1401 struct x86_function *func,
1402 struct tgsi_full_instruction *inst )
1403 {
1404 unsigned chan_index;
1405
1406 switch (inst->Instruction.Opcode) {
1407 case TGSI_OPCODE_ARL:
1408 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1409 FETCH( func, *inst, 0, 0, chan_index );
1410 emit_f2it( func, 0 );
1411 STORE( func, *inst, 0, 0, chan_index );
1412 }
1413 break;
1414
1415 case TGSI_OPCODE_MOV:
1416 case TGSI_OPCODE_SWZ:
1417 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1418 FETCH( func, *inst, 0, 0, chan_index );
1419 STORE( func, *inst, 0, 0, chan_index );
1420 }
1421 break;
1422
1423 case TGSI_OPCODE_LIT:
1424 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1425 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1426 emit_tempf(
1427 func,
1428 0,
1429 TEMP_ONE_I,
1430 TEMP_ONE_C);
1431 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1432 STORE( func, *inst, 0, 0, CHAN_X );
1433 }
1434 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1435 STORE( func, *inst, 0, 0, CHAN_W );
1436 }
1437 }
1438 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1439 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1440 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1441 FETCH( func, *inst, 0, 0, CHAN_X );
1442 sse_maxps(
1443 func,
1444 make_xmm( 0 ),
1445 get_temp(
1446 TGSI_EXEC_TEMP_00000000_I,
1447 TGSI_EXEC_TEMP_00000000_C ) );
1448 STORE( func, *inst, 0, 0, CHAN_Y );
1449 }
1450 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1451 /* XMM[1] = SrcReg[0].yyyy */
1452 FETCH( func, *inst, 1, 0, CHAN_Y );
1453 /* XMM[1] = max(XMM[1], 0) */
1454 sse_maxps(
1455 func,
1456 make_xmm( 1 ),
1457 get_temp(
1458 TGSI_EXEC_TEMP_00000000_I,
1459 TGSI_EXEC_TEMP_00000000_C ) );
1460 /* XMM[2] = SrcReg[0].wwww */
1461 FETCH( func, *inst, 2, 0, CHAN_W );
1462 /* XMM[2] = min(XMM[2], 128.0) */
1463 sse_minps(
1464 func,
1465 make_xmm( 2 ),
1466 get_temp(
1467 TGSI_EXEC_TEMP_128_I,
1468 TGSI_EXEC_TEMP_128_C ) );
1469 /* XMM[2] = max(XMM[2], -128.0) */
1470 sse_maxps(
1471 func,
1472 make_xmm( 2 ),
1473 get_temp(
1474 TGSI_EXEC_TEMP_MINUS_128_I,
1475 TGSI_EXEC_TEMP_MINUS_128_C ) );
1476 emit_pow( func, 3, 1, 2 );
1477 FETCH( func, *inst, 0, 0, CHAN_X );
1478 sse_xorps(
1479 func,
1480 make_xmm( 2 ),
1481 make_xmm( 2 ) );
1482 sse_cmpps(
1483 func,
1484 make_xmm( 2 ),
1485 make_xmm( 0 ),
1486 cc_LessThanEqual );
1487 sse_andps(
1488 func,
1489 make_xmm( 2 ),
1490 make_xmm( 1 ) );
1491 STORE( func, *inst, 2, 0, CHAN_Z );
1492 }
1493 }
1494 break;
1495
1496 case TGSI_OPCODE_RCP:
1497 /* TGSI_OPCODE_RECIP */
1498 FETCH( func, *inst, 0, 0, CHAN_X );
1499 emit_rcp( func, 0, 0 );
1500 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1501 STORE( func, *inst, 0, 0, chan_index );
1502 }
1503 break;
1504
1505 case TGSI_OPCODE_RSQ:
1506 /* TGSI_OPCODE_RECIPSQRT */
1507 FETCH( func, *inst, 0, 0, CHAN_X );
1508 emit_rsqrt( func, 1, 0 );
1509 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1510 STORE( func, *inst, 1, 0, chan_index );
1511 }
1512 break;
1513
1514 case TGSI_OPCODE_EXP:
1515 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1516 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1517 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1518 FETCH( func, *inst, 0, 0, CHAN_X );
1519 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1520 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1521 emit_MOV( func, 1, 0 );
1522 emit_flr( func, 2, 1 );
1523 /* dst.x = ex2(floor(src.x)) */
1524 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1525 emit_MOV( func, 2, 1 );
1526 emit_ex2( func, 3, 2 );
1527 STORE( func, *inst, 2, 0, CHAN_X );
1528 }
1529 /* dst.y = src.x - floor(src.x) */
1530 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1531 emit_MOV( func, 2, 0 );
1532 emit_sub( func, 2, 1 );
1533 STORE( func, *inst, 2, 0, CHAN_Y );
1534 }
1535 }
1536 /* dst.z = ex2(src.x) */
1537 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1538 emit_ex2( func, 3, 0 );
1539 STORE( func, *inst, 0, 0, CHAN_Z );
1540 }
1541 }
1542 /* dst.w = 1.0 */
1543 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1544 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1545 STORE( func, *inst, 0, 0, CHAN_W );
1546 }
1547 break;
1548
1549 case TGSI_OPCODE_LOG:
1550 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1551 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1552 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1553 FETCH( func, *inst, 0, 0, CHAN_X );
1554 emit_abs( func, 0 );
1555 emit_MOV( func, 1, 0 );
1556 emit_lg2( func, 2, 1 );
1557 /* dst.z = lg2(abs(src.x)) */
1558 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1559 STORE( func, *inst, 1, 0, CHAN_Z );
1560 }
1561 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1562 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1563 emit_flr( func, 2, 1 );
1564 /* dst.x = floor(lg2(abs(src.x))) */
1565 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1566 STORE( func, *inst, 1, 0, CHAN_X );
1567 }
1568 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1569 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1570 emit_ex2( func, 2, 1 );
1571 emit_rcp( func, 1, 1 );
1572 emit_mul( func, 0, 1 );
1573 STORE( func, *inst, 0, 0, CHAN_Y );
1574 }
1575 }
1576 }
1577 /* dst.w = 1.0 */
1578 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1579 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1580 STORE( func, *inst, 0, 0, CHAN_W );
1581 }
1582 break;
1583
1584 case TGSI_OPCODE_MUL:
1585 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1586 FETCH( func, *inst, 0, 0, chan_index );
1587 FETCH( func, *inst, 1, 1, chan_index );
1588 emit_mul( func, 0, 1 );
1589 STORE( func, *inst, 0, 0, chan_index );
1590 }
1591 break;
1592
1593 case TGSI_OPCODE_ADD:
1594 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1595 FETCH( func, *inst, 0, 0, chan_index );
1596 FETCH( func, *inst, 1, 1, chan_index );
1597 emit_add( func, 0, 1 );
1598 STORE( func, *inst, 0, 0, chan_index );
1599 }
1600 break;
1601
1602 case TGSI_OPCODE_DP3:
1603 /* TGSI_OPCODE_DOT3 */
1604 FETCH( func, *inst, 0, 0, CHAN_X );
1605 FETCH( func, *inst, 1, 1, CHAN_X );
1606 emit_mul( func, 0, 1 );
1607 FETCH( func, *inst, 1, 0, CHAN_Y );
1608 FETCH( func, *inst, 2, 1, CHAN_Y );
1609 emit_mul( func, 1, 2 );
1610 emit_add( func, 0, 1 );
1611 FETCH( func, *inst, 1, 0, CHAN_Z );
1612 FETCH( func, *inst, 2, 1, CHAN_Z );
1613 emit_mul( func, 1, 2 );
1614 emit_add( func, 0, 1 );
1615 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1616 STORE( func, *inst, 0, 0, chan_index );
1617 }
1618 break;
1619
1620 case TGSI_OPCODE_DP4:
1621 /* TGSI_OPCODE_DOT4 */
1622 FETCH( func, *inst, 0, 0, CHAN_X );
1623 FETCH( func, *inst, 1, 1, CHAN_X );
1624 emit_mul( func, 0, 1 );
1625 FETCH( func, *inst, 1, 0, CHAN_Y );
1626 FETCH( func, *inst, 2, 1, CHAN_Y );
1627 emit_mul( func, 1, 2 );
1628 emit_add( func, 0, 1 );
1629 FETCH( func, *inst, 1, 0, CHAN_Z );
1630 FETCH( func, *inst, 2, 1, CHAN_Z );
1631 emit_mul(func, 1, 2 );
1632 emit_add(func, 0, 1 );
1633 FETCH( func, *inst, 1, 0, CHAN_W );
1634 FETCH( func, *inst, 2, 1, CHAN_W );
1635 emit_mul( func, 1, 2 );
1636 emit_add( func, 0, 1 );
1637 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1638 STORE( func, *inst, 0, 0, chan_index );
1639 }
1640 break;
1641
1642 case TGSI_OPCODE_DST:
1643 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1644 emit_tempf(
1645 func,
1646 0,
1647 TEMP_ONE_I,
1648 TEMP_ONE_C );
1649 STORE( func, *inst, 0, 0, CHAN_X );
1650 }
1651 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1652 FETCH( func, *inst, 0, 0, CHAN_Y );
1653 FETCH( func, *inst, 1, 1, CHAN_Y );
1654 emit_mul( func, 0, 1 );
1655 STORE( func, *inst, 0, 0, CHAN_Y );
1656 }
1657 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1658 FETCH( func, *inst, 0, 0, CHAN_Z );
1659 STORE( func, *inst, 0, 0, CHAN_Z );
1660 }
1661 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1662 FETCH( func, *inst, 0, 1, CHAN_W );
1663 STORE( func, *inst, 0, 0, CHAN_W );
1664 }
1665 break;
1666
1667 case TGSI_OPCODE_MIN:
1668 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1669 FETCH( func, *inst, 0, 0, chan_index );
1670 FETCH( func, *inst, 1, 1, chan_index );
1671 sse_minps(
1672 func,
1673 make_xmm( 0 ),
1674 make_xmm( 1 ) );
1675 STORE( func, *inst, 0, 0, chan_index );
1676 }
1677 break;
1678
1679 case TGSI_OPCODE_MAX:
1680 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1681 FETCH( func, *inst, 0, 0, chan_index );
1682 FETCH( func, *inst, 1, 1, chan_index );
1683 sse_maxps(
1684 func,
1685 make_xmm( 0 ),
1686 make_xmm( 1 ) );
1687 STORE( func, *inst, 0, 0, chan_index );
1688 }
1689 break;
1690
1691 case TGSI_OPCODE_SLT:
1692 /* TGSI_OPCODE_SETLT */
1693 emit_setcc( func, inst, cc_LessThan );
1694 break;
1695
1696 case TGSI_OPCODE_SGE:
1697 /* TGSI_OPCODE_SETGE */
1698 emit_setcc( func, inst, cc_NotLessThan );
1699 break;
1700
1701 case TGSI_OPCODE_MAD:
1702 /* TGSI_OPCODE_MADD */
1703 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1704 FETCH( func, *inst, 0, 0, chan_index );
1705 FETCH( func, *inst, 1, 1, chan_index );
1706 FETCH( func, *inst, 2, 2, chan_index );
1707 emit_mul( func, 0, 1 );
1708 emit_add( func, 0, 2 );
1709 STORE( func, *inst, 0, 0, chan_index );
1710 }
1711 break;
1712
1713 case TGSI_OPCODE_SUB:
1714 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1715 FETCH( func, *inst, 0, 0, chan_index );
1716 FETCH( func, *inst, 1, 1, chan_index );
1717 emit_sub( func, 0, 1 );
1718 STORE( func, *inst, 0, 0, chan_index );
1719 }
1720 break;
1721
1722 case TGSI_OPCODE_LERP:
1723 /* TGSI_OPCODE_LRP */
1724 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1725 FETCH( func, *inst, 0, 0, chan_index );
1726 FETCH( func, *inst, 1, 1, chan_index );
1727 FETCH( func, *inst, 2, 2, chan_index );
1728 emit_sub( func, 1, 2 );
1729 emit_mul( func, 0, 1 );
1730 emit_add( func, 0, 2 );
1731 STORE( func, *inst, 0, 0, chan_index );
1732 }
1733 break;
1734
1735 case TGSI_OPCODE_CND:
1736 return 0;
1737 break;
1738
1739 case TGSI_OPCODE_CND0:
1740 return 0;
1741 break;
1742
1743 case TGSI_OPCODE_DOT2ADD:
1744 /* TGSI_OPCODE_DP2A */
1745 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
1746 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
1747 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1748 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
1749 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
1750 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1751 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1752 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
1753 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1754 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1755 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
1756 }
1757 break;
1758
1759 case TGSI_OPCODE_INDEX:
1760 return 0;
1761 break;
1762
1763 case TGSI_OPCODE_NEGATE:
1764 return 0;
1765 break;
1766
1767 case TGSI_OPCODE_FRAC:
1768 /* TGSI_OPCODE_FRC */
1769 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1770 FETCH( func, *inst, 0, 0, chan_index );
1771 emit_frc( func, 0, 0 );
1772 STORE( func, *inst, 0, 0, chan_index );
1773 }
1774 break;
1775
1776 case TGSI_OPCODE_CLAMP:
1777 return 0;
1778 break;
1779
1780 case TGSI_OPCODE_FLOOR:
1781 /* TGSI_OPCODE_FLR */
1782 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1783 FETCH( func, *inst, 0, 0, chan_index );
1784 emit_flr( func, 0, 0 );
1785 STORE( func, *inst, 0, 0, chan_index );
1786 }
1787 break;
1788
1789 case TGSI_OPCODE_ROUND:
1790 return 0;
1791 break;
1792
1793 case TGSI_OPCODE_EXPBASE2:
1794 /* TGSI_OPCODE_EX2 */
1795 FETCH( func, *inst, 0, 0, CHAN_X );
1796 emit_ex2( func, 0, 0 );
1797 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1798 STORE( func, *inst, 0, 0, chan_index );
1799 }
1800 break;
1801
1802 case TGSI_OPCODE_LOGBASE2:
1803 /* TGSI_OPCODE_LG2 */
1804 FETCH( func, *inst, 0, 0, CHAN_X );
1805 emit_lg2( func, 0, 0 );
1806 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1807 STORE( func, *inst, 0, 0, chan_index );
1808 }
1809 break;
1810
1811 case TGSI_OPCODE_POWER:
1812 /* TGSI_OPCODE_POW */
1813 FETCH( func, *inst, 0, 0, CHAN_X );
1814 FETCH( func, *inst, 1, 1, CHAN_X );
1815 emit_pow( func, 0, 0, 1 );
1816 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1817 STORE( func, *inst, 0, 0, chan_index );
1818 }
1819 break;
1820
1821 case TGSI_OPCODE_CROSSPRODUCT:
1822 /* TGSI_OPCODE_XPD */
1823 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1824 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1825 FETCH( func, *inst, 1, 1, CHAN_Z );
1826 FETCH( func, *inst, 3, 0, CHAN_Z );
1827 }
1828 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1829 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1830 FETCH( func, *inst, 0, 0, CHAN_Y );
1831 FETCH( func, *inst, 4, 1, CHAN_Y );
1832 }
1833 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1834 emit_MOV( func, 2, 0 );
1835 emit_mul( func, 2, 1 );
1836 emit_MOV( func, 5, 3 );
1837 emit_mul( func, 5, 4 );
1838 emit_sub( func, 2, 5 );
1839 STORE( func, *inst, 2, 0, CHAN_X );
1840 }
1841 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1842 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1843 FETCH( func, *inst, 2, 1, CHAN_X );
1844 FETCH( func, *inst, 5, 0, CHAN_X );
1845 }
1846 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1847 emit_mul( func, 3, 2 );
1848 emit_mul( func, 1, 5 );
1849 emit_sub( func, 3, 1 );
1850 STORE( func, *inst, 3, 0, CHAN_Y );
1851 }
1852 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1853 emit_mul( func, 5, 4 );
1854 emit_mul( func, 0, 2 );
1855 emit_sub( func, 5, 0 );
1856 STORE( func, *inst, 5, 0, CHAN_Z );
1857 }
1858 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1859 emit_tempf(
1860 func,
1861 0,
1862 TEMP_ONE_I,
1863 TEMP_ONE_C );
1864 STORE( func, *inst, 0, 0, CHAN_W );
1865 }
1866 break;
1867
1868 case TGSI_OPCODE_MULTIPLYMATRIX:
1869 return 0;
1870 break;
1871
1872 case TGSI_OPCODE_ABS:
1873 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1874 FETCH( func, *inst, 0, 0, chan_index );
1875 emit_abs( func, 0) ;
1876
1877 STORE( func, *inst, 0, 0, chan_index );
1878 }
1879 break;
1880
1881 case TGSI_OPCODE_RCC:
1882 return 0;
1883 break;
1884
1885 case TGSI_OPCODE_DPH:
1886 FETCH( func, *inst, 0, 0, CHAN_X );
1887 FETCH( func, *inst, 1, 1, CHAN_X );
1888 emit_mul( func, 0, 1 );
1889 FETCH( func, *inst, 1, 0, CHAN_Y );
1890 FETCH( func, *inst, 2, 1, CHAN_Y );
1891 emit_mul( func, 1, 2 );
1892 emit_add( func, 0, 1 );
1893 FETCH( func, *inst, 1, 0, CHAN_Z );
1894 FETCH( func, *inst, 2, 1, CHAN_Z );
1895 emit_mul( func, 1, 2 );
1896 emit_add( func, 0, 1 );
1897 FETCH( func, *inst, 1, 1, CHAN_W );
1898 emit_add( func, 0, 1 );
1899 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1900 STORE( func, *inst, 0, 0, chan_index );
1901 }
1902 break;
1903
1904 case TGSI_OPCODE_COS:
1905 FETCH( func, *inst, 0, 0, CHAN_X );
1906 emit_cos( func, 0, 0 );
1907 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1908 STORE( func, *inst, 0, 0, chan_index );
1909 }
1910 break;
1911
1912 case TGSI_OPCODE_DDX:
1913 return 0;
1914 break;
1915
1916 case TGSI_OPCODE_DDY:
1917 return 0;
1918 break;
1919
1920 case TGSI_OPCODE_KILP:
1921 /* predicated kill */
1922 emit_kilp( func );
1923 return 0; /* XXX fix me */
1924 break;
1925
1926 case TGSI_OPCODE_KIL:
1927 /* conditional kill */
1928 emit_kil( func, &inst->FullSrcRegisters[0] );
1929 break;
1930
1931 case TGSI_OPCODE_PK2H:
1932 return 0;
1933 break;
1934
1935 case TGSI_OPCODE_PK2US:
1936 return 0;
1937 break;
1938
1939 case TGSI_OPCODE_PK4B:
1940 return 0;
1941 break;
1942
1943 case TGSI_OPCODE_PK4UB:
1944 return 0;
1945 break;
1946
1947 case TGSI_OPCODE_RFL:
1948 return 0;
1949 break;
1950
1951 case TGSI_OPCODE_SEQ:
1952 return 0;
1953 break;
1954
1955 case TGSI_OPCODE_SFL:
1956 return 0;
1957 break;
1958
1959 case TGSI_OPCODE_SGT:
1960 return 0;
1961 break;
1962
1963 case TGSI_OPCODE_SIN:
1964 FETCH( func, *inst, 0, 0, CHAN_X );
1965 emit_sin( func, 0, 0 );
1966 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1967 STORE( func, *inst, 0, 0, chan_index );
1968 }
1969 break;
1970
1971 case TGSI_OPCODE_SLE:
1972 return 0;
1973 break;
1974
1975 case TGSI_OPCODE_SNE:
1976 return 0;
1977 break;
1978
1979 case TGSI_OPCODE_STR:
1980 return 0;
1981 break;
1982
1983 case TGSI_OPCODE_TEX:
1984 if (0) {
1985 /* Disable dummy texture code:
1986 */
1987 emit_tempf(
1988 func,
1989 0,
1990 TEMP_ONE_I,
1991 TEMP_ONE_C );
1992 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1993 STORE( func, *inst, 0, 0, chan_index );
1994 }
1995 }
1996 else {
1997 return 0;
1998 }
1999 break;
2000
2001 case TGSI_OPCODE_TXD:
2002 return 0;
2003 break;
2004
2005 case TGSI_OPCODE_UP2H:
2006 return 0;
2007 break;
2008
2009 case TGSI_OPCODE_UP2US:
2010 return 0;
2011 break;
2012
2013 case TGSI_OPCODE_UP4B:
2014 return 0;
2015 break;
2016
2017 case TGSI_OPCODE_UP4UB:
2018 return 0;
2019 break;
2020
2021 case TGSI_OPCODE_X2D:
2022 return 0;
2023 break;
2024
2025 case TGSI_OPCODE_ARA:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_ARR:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_BRA:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_CAL:
2038 return 0;
2039 break;
2040
2041 case TGSI_OPCODE_RET:
2042 emit_ret( func );
2043 break;
2044
2045 case TGSI_OPCODE_END:
2046 break;
2047
2048 case TGSI_OPCODE_SSG:
2049 return 0;
2050 break;
2051
2052 case TGSI_OPCODE_CMP:
2053 emit_cmp (func, inst);
2054 break;
2055
2056 case TGSI_OPCODE_SCS:
2057 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2058 FETCH( func, *inst, 0, 0, CHAN_X );
2059 emit_cos( func, 0, 0 );
2060 STORE( func, *inst, 0, 0, CHAN_X );
2061 }
2062 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2063 FETCH( func, *inst, 0, 0, CHAN_X );
2064 emit_sin( func, 0, 0 );
2065 STORE( func, *inst, 0, 0, CHAN_Y );
2066 }
2067 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2068 emit_tempf(
2069 func,
2070 0,
2071 TGSI_EXEC_TEMP_00000000_I,
2072 TGSI_EXEC_TEMP_00000000_C );
2073 STORE( func, *inst, 0, 0, CHAN_Z );
2074 }
2075 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2076 emit_tempf(
2077 func,
2078 0,
2079 TEMP_ONE_I,
2080 TEMP_ONE_C );
2081 STORE( func, *inst, 0, 0, CHAN_W );
2082 }
2083 break;
2084
2085 case TGSI_OPCODE_TXB:
2086 return 0;
2087 break;
2088
2089 case TGSI_OPCODE_NRM:
2090 /* fall-through */
2091 case TGSI_OPCODE_NRM4:
2092 /* 3 or 4-component normalization */
2093 {
2094 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2095 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2096 FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
2097 FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
2098 FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
2099 if (dims == 4) {
2100 FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
2101 }
2102 emit_MOV( func, 0, 4 ); /* xmm0 = xmm3 */
2103 emit_mul( func, 0, 4 ); /* xmm0 *= xmm3 */
2104 emit_MOV( func, 1, 5 ); /* xmm1 = xmm4 */
2105 emit_mul( func, 1, 5 ); /* xmm1 *= xmm4 */
2106 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
2107 emit_MOV( func, 1, 6 ); /* xmm1 = xmm5 */
2108 emit_mul( func, 1, 6 ); /* xmm1 *= xmm5 */
2109 emit_add( func, 0, 1 ); /* xmm0 += xmm1 */
2110 if (dims == 4) {
2111 emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */
2112 emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */
2113 emit_add( func, 0, 0 ); /* xmm0 += xmm1 */
2114 }
2115 emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117 if (chan_index < dims) {
2118 emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
2119 STORE( func, *inst, 4+chan_index, 0, chan_index );
2120 }
2121 }
2122 }
2123 break;
2124
2125 case TGSI_OPCODE_DIV:
2126 return 0;
2127 break;
2128
2129 case TGSI_OPCODE_DP2:
2130 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2131 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2132 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2133 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2134 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2135 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2136 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2137 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2138 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2139 }
2140 break;
2141
2142 case TGSI_OPCODE_TXL:
2143 return 0;
2144 break;
2145
2146 case TGSI_OPCODE_BRK:
2147 return 0;
2148 break;
2149
2150 case TGSI_OPCODE_IF:
2151 return 0;
2152 break;
2153
2154 case TGSI_OPCODE_LOOP:
2155 return 0;
2156 break;
2157
2158 case TGSI_OPCODE_REP:
2159 return 0;
2160 break;
2161
2162 case TGSI_OPCODE_ELSE:
2163 return 0;
2164 break;
2165
2166 case TGSI_OPCODE_ENDIF:
2167 return 0;
2168 break;
2169
2170 case TGSI_OPCODE_ENDLOOP:
2171 return 0;
2172 break;
2173
2174 case TGSI_OPCODE_ENDREP:
2175 return 0;
2176 break;
2177
2178 case TGSI_OPCODE_PUSHA:
2179 return 0;
2180 break;
2181
2182 case TGSI_OPCODE_POPA:
2183 return 0;
2184 break;
2185
2186 case TGSI_OPCODE_CEIL:
2187 return 0;
2188 break;
2189
2190 case TGSI_OPCODE_I2F:
2191 return 0;
2192 break;
2193
2194 case TGSI_OPCODE_NOT:
2195 return 0;
2196 break;
2197
2198 case TGSI_OPCODE_TRUNC:
2199 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2200 FETCH( func, *inst, 0, 0, chan_index );
2201 emit_f2it( func, 0 );
2202 emit_i2f( func, 0 );
2203 STORE( func, *inst, 0, 0, chan_index );
2204 }
2205 break;
2206
2207 case TGSI_OPCODE_SHL:
2208 return 0;
2209 break;
2210
2211 case TGSI_OPCODE_SHR:
2212 return 0;
2213 break;
2214
2215 case TGSI_OPCODE_AND:
2216 return 0;
2217 break;
2218
2219 case TGSI_OPCODE_OR:
2220 return 0;
2221 break;
2222
2223 case TGSI_OPCODE_MOD:
2224 return 0;
2225 break;
2226
2227 case TGSI_OPCODE_XOR:
2228 return 0;
2229 break;
2230
2231 case TGSI_OPCODE_SAD:
2232 return 0;
2233 break;
2234
2235 case TGSI_OPCODE_TXF:
2236 return 0;
2237 break;
2238
2239 case TGSI_OPCODE_TXQ:
2240 return 0;
2241 break;
2242
2243 case TGSI_OPCODE_CONT:
2244 return 0;
2245 break;
2246
2247 case TGSI_OPCODE_EMIT:
2248 return 0;
2249 break;
2250
2251 case TGSI_OPCODE_ENDPRIM:
2252 return 0;
2253 break;
2254
2255 default:
2256 return 0;
2257 }
2258
2259 return 1;
2260 }
2261
/**
 * Emit per-fragment interpolation code for one fragment-shader input
 * declaration.  For each declared attribute and each channel named in
 * the usage mask, the input value is evaluated from the plane
 * coefficients (a0, dadx, dady) and written to the machine's input
 * array via emit_inputs().
 *
 * Temp reg 0 is used as the fragment position: x in channel X, y in
 * channel Y, and w in channel W (w only read for perspective
 * interpolation -- see the SWIZZLE fetches below).
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      /* one interpolation per (attribute, enabled channel) pair */
      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 (flat -- no positional variation) */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = x * dadx + y * dady + a0 */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (x * dadx + y * dady + a0) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  /* unknown interpolation mode */
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2322
2323 static void aos_to_soa( struct x86_function *func,
2324 uint arg_aos,
2325 uint arg_soa,
2326 uint arg_num,
2327 uint arg_stride )
2328 {
2329 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2330 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2331 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2332 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2333 int inner_loop;
2334
2335
2336 /* Save EBX */
2337 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2338
2339 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2340 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2341 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2342 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2343
2344 /* do */
2345 inner_loop = x86_get_label( func );
2346 {
2347 x86_push( func, aos_input );
2348 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2349 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2350 x86_add( func, aos_input, stride );
2351 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2352 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2353 x86_add( func, aos_input, stride );
2354 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2355 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2356 x86_add( func, aos_input, stride );
2357 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2358 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2359 x86_pop( func, aos_input );
2360
2361 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2362 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2363 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2364 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2365 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2366 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2367
2368 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2369 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2370 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2371 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2372
2373 /* Advance to next input */
2374 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2375 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2376 }
2377 /* while --num_inputs */
2378 x86_dec( func, num_inputs );
2379 x86_jcc( func, cc_NE, inner_loop );
2380
2381 /* Restore EBX */
2382 x86_pop( func, aos_input );
2383 }
2384
2385 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2386 {
2387 struct x86_reg soa_output;
2388 struct x86_reg aos_output;
2389 struct x86_reg num_outputs;
2390 struct x86_reg temp;
2391 int inner_loop;
2392
2393 soa_output = x86_make_reg( file_REG32, reg_AX );
2394 aos_output = x86_make_reg( file_REG32, reg_BX );
2395 num_outputs = x86_make_reg( file_REG32, reg_CX );
2396 temp = x86_make_reg( file_REG32, reg_DX );
2397
2398 /* Save EBX */
2399 x86_push( func, aos_output );
2400
2401 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2402 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2403 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2404
2405 /* do */
2406 inner_loop = x86_get_label( func );
2407 {
2408 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2409 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2410 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2411 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2412
2413 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2414 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2415 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2416 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2417 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2418 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2419
2420 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2421 x86_push( func, aos_output );
2422 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2423 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2424 x86_add( func, aos_output, temp );
2425 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2426 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2427 x86_add( func, aos_output, temp );
2428 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2429 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2430 x86_add( func, aos_output, temp );
2431 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2432 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2433 x86_pop( func, aos_output );
2434
2435 /* Advance to next output */
2436 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2437 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2438 }
2439 /* while --num_outputs */
2440 x86_dec( func, num_outputs );
2441 x86_jcc( func, cc_NE, inner_loop );
2442
2443 /* Restore EBX */
2444 x86_pop( func, aos_output );
2445 }
2446
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs. Because on x86 we have only 4 GP registers, and here we
 * have 5 shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 * GP register holding the output argument is aliased with the coeff
 * argument, as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  vertex shaders only: if TRUE, emit AoS->SoA input
 *                     conversion on entry and SoA->AoS output conversion
 *                     on exit
 * \return 1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* start emitting at the top of the code buffer */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );


   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here -- loaded later, when the
       * INSTRUCTION phase overwrites the coef register */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      /* convert AoS vertex inputs to the SoA layout the generated
       * code operates on */
      if (do_swizzles)
         aos_to_soa( func,
                     6,  /* aos_input */
                     1,  /* machine->input */
                     7,  /* num_inputs */
                     8 );  /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   /* translate tokens one by one; stop on first unsupported opcode */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            /* Size includes the token header, hence the -1 */
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* convert SoA results back to the caller's AoS output layout */
      if (do_swizzles)
         soa_to_aos( func, 9, 2, 10, 11 );
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2637
#endif /* PIPE_ARCH_X86 && PIPE_ARCH_SSE */
2639