1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "pipe/p_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #if defined(PIPE_ARCH_SSE)
36 #include "util/u_sse.h"
37 #endif
38 #include "tgsi/tgsi_parse.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi_exec.h"
41 #include "tgsi_sse2.h"
42
43 #include "rtasm/rtasm_x86sse.h"
44
45 /* Use the higher-precision (Newton-Raphson refined) 1/sqrt().
46 *
47 * This costs about 100 fps (close to 10%) in gears:
48 */
49 #define HIGH_PRECISION 1
50
51 #define FAST_MATH 1
52
53
54 #define FOR_EACH_CHANNEL( CHAN )\
55 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
56
57 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
58 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
59
60 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
61 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
62
63 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
64 FOR_EACH_CHANNEL( CHAN )\
65 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
66
67 #define CHAN_X 0
68 #define CHAN_Y 1
69 #define CHAN_Z 2
70 #define CHAN_W 3
71
72 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
73 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
74
75 #define TEMP_R0 TGSI_EXEC_TEMP_R0
76 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
77 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
78 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
79
80
81 /**
82 * X86 utility functions.
83 */
84
85 static struct x86_reg
86 make_xmm(
87 unsigned xmm )
88 {
89 return x86_make_reg(
90 file_XMM,
91 (enum x86_reg_name) xmm );
92 }
93
94 /**
95 * X86 register mapping helpers.
96 */
97
98 static struct x86_reg
99 get_const_base( void )
100 {
101 return x86_make_reg(
102 file_REG32,
103 reg_CX );
104 }
105
106 static struct x86_reg
107 get_input_base( void )
108 {
109 return x86_make_reg(
110 file_REG32,
111 reg_AX );
112 }
113
114 static struct x86_reg
115 get_output_base( void )
116 {
117 return x86_make_reg(
118 file_REG32,
119 reg_DX );
120 }
121
122 static struct x86_reg
123 get_temp_base( void )
124 {
125 return x86_make_reg(
126 file_REG32,
127 reg_BX );
128 }
129
130 static struct x86_reg
131 get_coef_base( void )
132 {
133 return get_output_base();
134 }
135
136 static struct x86_reg
137 get_immediate_base( void )
138 {
139 return x86_make_reg(
140 file_REG32,
141 reg_DI );
142 }
143
144
145 /**
146 * Data access helpers.
147 */
148
149
150 static struct x86_reg
151 get_immediate(
152 unsigned vec,
153 unsigned chan )
154 {
155 return x86_make_disp(
156 get_immediate_base(),
157 (vec * 4 + chan) * 4 );
158 }
159
160 static struct x86_reg
161 get_const(
162 unsigned vec,
163 unsigned chan )
164 {
165 return x86_make_disp(
166 get_const_base(),
167 (vec * 4 + chan) * 4 );
168 }
169
170 static struct x86_reg
171 get_input(
172 unsigned vec,
173 unsigned chan )
174 {
175 return x86_make_disp(
176 get_input_base(),
177 (vec * 4 + chan) * 16 );
178 }
179
180 static struct x86_reg
181 get_output(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_output_base(),
187 (vec * 4 + chan) * 16 );
188 }
189
190 static struct x86_reg
191 get_temp(
192 unsigned vec,
193 unsigned chan )
194 {
195 return x86_make_disp(
196 get_temp_base(),
197 (vec * 4 + chan) * 16 );
198 }
199
200 static struct x86_reg
201 get_coef(
202 unsigned vec,
203 unsigned chan,
204 unsigned member )
205 {
206 return x86_make_disp(
207 get_coef_base(),
208 ((vec * 3 + member) * 4 + chan) * 4 );
209 }
210
211
212 static void
213 emit_ret(
214 struct x86_function *func )
215 {
216 x86_ret( func );
217 }
218
219
220 /**
221 * Data fetch helpers.
222 */
223
224 /**
225 * Copy a shader constant to xmm register
226 * \param xmm the destination xmm register
227 * \param vec the src const buffer index
228 * \param chan src channel to fetch (X, Y, Z or W)
229 */
230 static void
231 emit_const(
232 struct x86_function *func,
233 uint xmm,
234 int vec,
235 uint chan,
236 uint indirect,
237 uint indirectFile,
238 int indirectIndex )
239 {
240 if (indirect) {
241 /* 'vec' is the offset from the address register's value.
242 * We're loading CONST[ADDR+vec] into an xmm register.
243 */
244 struct x86_reg r0 = get_input_base();
245 struct x86_reg r1 = get_output_base();
246 uint i;
247
248 assert( indirectFile == TGSI_FILE_ADDRESS );
249 assert( indirectIndex == 0 );
250
251 x86_push( func, r0 );
252 x86_push( func, r1 );
253
254 /*
255 * Loop over the four pixels or vertices in the quad.
256 * Get the value of the address (offset) register for pixel/vertex[i],
257 * add it to the src offset and index into the constant buffer.
258 * Note that we're working on SOA data.
259 * If any of the pixel/vertex execution channels are unused their
260 * values will be garbage. It's very important that we don't use
261 * those garbage values as indexes into the constant buffer since
262 * that'll cause segfaults.
263 * The solution is to bitwise-AND the offset with the execution mask
264 * register whose values are either 0 or ~0.
265 * The caller must set up the execution mask register to indicate
266 * which channels are valid/alive before running the shader.
267 * The execution mask will also figure into loops and conditionals
268 * someday.
269 */
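/* Net effect, for each pixel/vertex i:
 *    idx = addr[i] & exec_mask[i];
 *    TEMP_R0.x[i] = CONST[vec + idx].chan;
 */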
270 for (i = 0; i < QUAD_SIZE; i++) {
271 /* r1 = address register[i] */
272 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
273 /* r0 = execution mask[i] */
274 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
275 /* r1 = r1 & r0 */
276 x86_and( func, r1, r0 );
277 /* r0 = 'vec', the offset */
278 x86_lea( func, r0, get_const( vec, chan ) );
279
280 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
281 */
282 x86_add( func, r1, r1 );
283 x86_add( func, r1, r1 );
284 x86_add( func, r1, r1 );
285 x86_add( func, r1, r1 );
286
287 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
288 x86_mov( func, r1, x86_deref( r0 ) );
289 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
290 }
291
292 x86_pop( func, r1 );
293 x86_pop( func, r0 );
294
295 sse_movaps(
296 func,
297 make_xmm( xmm ),
298 get_temp( TEMP_R0, CHAN_X ) );
299 }
300 else {
301 /* 'vec' is the index into the src register file, such as TEMP[vec] */
302 assert( vec >= 0 );
303
304 sse_movss(
305 func,
306 make_xmm( xmm ),
307 get_const( vec, chan ) );
308 sse_shufps(
309 func,
310 make_xmm( xmm ),
311 make_xmm( xmm ),
312 SHUF( 0, 0, 0, 0 ) );
313 }
314 }
315
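/**
 * Copy a shader immediate to xmm register
 * \param xmm the destination xmm register
 * \param vec the src immediate index
 * \param chan src channel to fetch (X, Y, Z or W)
 */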
316 static void
317 emit_immediate(
318 struct x86_function *func,
319 unsigned xmm,
320 unsigned vec,
321 unsigned chan )
322 {
323 sse_movss(
324 func,
325 make_xmm( xmm ),
326 get_immediate( vec, chan ) );
327 sse_shufps(
328 func,
329 make_xmm( xmm ),
330 make_xmm( xmm ),
331 SHUF( 0, 0, 0, 0 ) );
332 }
333
334
335 /**
336 * Copy a shader input to xmm register
337 * \param xmm the destination xmm register
338 * \param vec the src input attrib
339 * \param chan src channel to fetch (X, Y, Z or W)
340 */
341 static void
342 emit_inputf(
343 struct x86_function *func,
344 unsigned xmm,
345 unsigned vec,
346 unsigned chan )
347 {
348 sse_movups(
349 func,
350 make_xmm( xmm ),
351 get_input( vec, chan ) );
352 }
353
354 /**
355 * Store an xmm register to a shader output
356 * \param xmm the source xmm register
357 * \param vec the dest output attrib
358 * \param chan dest channel to store (X, Y, Z or W)
359 */
360 static void
361 emit_output(
362 struct x86_function *func,
363 unsigned xmm,
364 unsigned vec,
365 unsigned chan )
366 {
367 sse_movups(
368 func,
369 get_output( vec, chan ),
370 make_xmm( xmm ) );
371 }
372
373 /**
374 * Copy a shader temporary to xmm register
375 * \param xmm the destination xmm register
376 * \param vec the src temp register
377 * \param chan src channel to fetch (X, Y, Z or W)
378 */
379 static void
380 emit_tempf(
381 struct x86_function *func,
382 unsigned xmm,
383 unsigned vec,
384 unsigned chan )
385 {
386 sse_movaps(
387 func,
388 make_xmm( xmm ),
389 get_temp( vec, chan ) );
390 }
391
392 /**
393 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
394 * \param xmm the destination xmm register
395 * \param vec the src input/attribute coefficient index
396 * \param chan src channel to fetch (X, Y, Z or W)
397 * \param member 0=a0, 1=dadx, 2=dady
398 */
399 static void
400 emit_coef(
401 struct x86_function *func,
402 unsigned xmm,
403 unsigned vec,
404 unsigned chan,
405 unsigned member )
406 {
407 sse_movss(
408 func,
409 make_xmm( xmm ),
410 get_coef( vec, chan, member ) );
411 sse_shufps(
412 func,
413 make_xmm( xmm ),
414 make_xmm( xmm ),
415 SHUF( 0, 0, 0, 0 ) );
416 }
417
418 /**
419 * Data store helpers.
420 */
421
422 static void
423 emit_inputs(
424 struct x86_function *func,
425 unsigned xmm,
426 unsigned vec,
427 unsigned chan )
428 {
429 sse_movups(
430 func,
431 get_input( vec, chan ),
432 make_xmm( xmm ) );
433 }
434
435 static void
436 emit_temps(
437 struct x86_function *func,
438 unsigned xmm,
439 unsigned vec,
440 unsigned chan )
441 {
442 sse_movaps(
443 func,
444 get_temp( vec, chan ),
445 make_xmm( xmm ) );
446 }
447
448 static void
449 emit_addrs(
450 struct x86_function *func,
451 unsigned xmm,
452 unsigned vec,
453 unsigned chan )
454 {
455 assert( vec == 0 );
456
457 emit_temps(
458 func,
459 xmm,
460 vec + TGSI_EXEC_TEMP_ADDR,
461 chan );
462 }
463
464 /**
465 * Coefficient fetch helpers.
466 */
467
468 static void
469 emit_coef_a0(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 emit_coef(
476 func,
477 xmm,
478 vec,
479 chan,
480 0 );
481 }
482
483 static void
484 emit_coef_dadx(
485 struct x86_function *func,
486 unsigned xmm,
487 unsigned vec,
488 unsigned chan )
489 {
490 emit_coef(
491 func,
492 xmm,
493 vec,
494 chan,
495 1 );
496 }
497
498 static void
499 emit_coef_dady(
500 struct x86_function *func,
501 unsigned xmm,
502 unsigned vec,
503 unsigned chan )
504 {
505 emit_coef(
506 func,
507 xmm,
508 vec,
509 chan,
510 2 );
511 }
512
513 /**
514 * Function call helpers.
515 */
516
517 /**
518 * NOTE: With gcc, if the called function uses SSE intrinsics, it must be
519 * defined with __attribute__((force_align_arg_pointer)), as we do not
520 * guarantee that the stack pointer is 16-byte aligned, as the callee expects.
521 */
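/*
 * Call a C helper that operates on a quad of floats, e.g. cos4f().
 * The destination xmm register is spilled to TEMP_R0 and its address is
 * passed as the single argument.  'xmm_save' is the number of live xmm
 * registers (xmm0..xmm_save-1); all of them except xmm_dst are preserved
 * on the stack across the call, along with EAX, ECX and EDX.
 */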
522 static void
523 emit_func_call_dst(
524 struct x86_function *func,
525 unsigned xmm_save,
526 unsigned xmm_dst,
527 void (PIPE_CDECL *code)() )
528 {
529 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
530 unsigned i, n;
531 unsigned xmm_mask;
532
533 /* Bitmask of the xmm registers to save */
534 xmm_mask = (1 << xmm_save) - 1;
535 xmm_mask &= ~(1 << xmm_dst);
536
537 sse_movaps(
538 func,
539 get_temp( TEMP_R0, 0 ),
540 make_xmm( xmm_dst ) );
541
542 x86_push(
543 func,
544 x86_make_reg( file_REG32, reg_AX) );
545 x86_push(
546 func,
547 x86_make_reg( file_REG32, reg_CX) );
548 x86_push(
549 func,
550 x86_make_reg( file_REG32, reg_DX) );
551
552 for(i = 0, n = 0; i < 8; ++i)
553 if(xmm_mask & (1 << i))
554 ++n;
555
556 x86_sub_imm(
557 func,
558 x86_make_reg( file_REG32, reg_SP ),
559 n*16);
560
561 for(i = 0, n = 0; i < 8; ++i)
562 if(xmm_mask & (1 << i)) {
563 sse_movups(
564 func,
565 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
566 make_xmm( i ) );
567 ++n;
568 }
569
570 x86_lea(
571 func,
572 ecx,
573 get_temp( TEMP_R0, 0 ) );
574
575 x86_push( func, ecx );
576 x86_mov_reg_imm( func, ecx, (unsigned long) code );
577 x86_call( func, ecx );
578 x86_pop(func, ecx );
579
580 for(i = 0, n = 0; i < 8; ++i)
581 if(xmm_mask & (1 << i)) {
582 sse_movups(
583 func,
584 make_xmm( i ),
585 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
586 ++n;
587 }
588
589 x86_add_imm(
590 func,
591 x86_make_reg( file_REG32, reg_SP ),
592 n*16);
593
594 /* Restore the GP registers in reverse order.
595 */
596 x86_pop(
597 func,
598 x86_make_reg( file_REG32, reg_DX) );
599 x86_pop(
600 func,
601 x86_make_reg( file_REG32, reg_CX) );
602 x86_pop(
603 func,
604 x86_make_reg( file_REG32, reg_AX) );
605
606 sse_movaps(
607 func,
608 make_xmm( xmm_dst ),
609 get_temp( TEMP_R0, 0 ) );
610 }
611
612 static void
613 emit_func_call_dst_src(
614 struct x86_function *func,
615 unsigned xmm_save,
616 unsigned xmm_dst,
617 unsigned xmm_src,
618 void (PIPE_CDECL *code)() )
619 {
620 sse_movaps(
621 func,
622 get_temp( TEMP_R0, 1 ),
623 make_xmm( xmm_src ) );
624
625 emit_func_call_dst(
626 func,
627 xmm_save,
628 xmm_dst,
629 code );
630 }
631
632
633 #if defined(PIPE_ARCH_SSE)
634
635 /*
636 * Fast SSE2 implementation of special math functions.
637 */
638
639 #define POLY0(x, c0) _mm_set1_ps(c0)
640 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
641 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
642 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
643 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
644 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
645
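/* POLYn(x, c0, ..., cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n over all four floats of an SSE register
 * using Horner's rule, i.e. n mulps/addps pairs.
 */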
646 #define EXP_POLY_DEGREE 3
647 #define LOG_POLY_DEGREE 5
648
649 /**
650 * See http://www.devmaster.net/forums/showthread.php?p=43580
651 */
652 static INLINE __m128
653 exp2f4(__m128 x)
654 {
655 __m128i ipart;
656 __m128 fpart, expipart, expfpart;
657
658 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
659 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
660
661 /* ipart = int(x - 0.5) */
662 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
663
664 /* fpart = x - ipart */
665 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
666
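/* (ipart + 127) << 23 places the biased exponent in bits 30:23 of an
 * IEEE float, i.e. it constructs 2^ipart directly. */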
667 /* expipart = (float) (1 << ipart) */
668 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
669
670 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
671 #if EXP_POLY_DEGREE == 5
672 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
673 #elif EXP_POLY_DEGREE == 4
674 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
675 #elif EXP_POLY_DEGREE == 3
676 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
677 #elif EXP_POLY_DEGREE == 2
678 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
679 #else
680 #error
681 #endif
682
683 return _mm_mul_ps(expipart, expfpart);
684 }
685
686
687 /**
688 * See http://www.devmaster.net/forums/showthread.php?p=43580
689 */
690 static INLINE __m128
691 log2f4(__m128 x)
692 {
693 __m128i expmask = _mm_set1_epi32(0x7f800000);
694 __m128i mantmask = _mm_set1_epi32(0x007fffff);
695 __m128 one = _mm_set1_ps(1.0f);
696
697 __m128i i = _mm_castps_si128(x);
698
699 /* exp = (float) exponent(x) */
700 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
701
702 /* mant = (float) mantissa(x) */
703 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
704
705 __m128 logmant;
706
707 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
708 * These coefficients can be generated with
709 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
710 */
711 #if LOG_POLY_DEGREE == 6
712 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
713 #elif LOG_POLY_DEGREE == 5
714 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
715 #elif LOG_POLY_DEGREE == 4
716 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
717 #elif LOG_POLY_DEGREE == 3
718 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
719 #else
720 #error
721 #endif
722
723 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
724 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
725
726 return _mm_add_ps(logmant, exp);
727 }
728
729
730 static INLINE __m128
731 powf4(__m128 x, __m128 y)
732 {
733 return exp2f4(_mm_mul_ps(log2f4(x), y));
734 }
735
736 #endif /* PIPE_ARCH_SSE */
737
738
739
740 /**
741 * Low-level instruction translators.
742 */
743
744 static void
745 emit_abs(
746 struct x86_function *func,
747 unsigned xmm )
748 {
749 sse_andps(
750 func,
751 make_xmm( xmm ),
752 get_temp(
753 TGSI_EXEC_TEMP_7FFFFFFF_I,
754 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
755 }
756
757 static void
758 emit_add(
759 struct x86_function *func,
760 unsigned xmm_dst,
761 unsigned xmm_src )
762 {
763 sse_addps(
764 func,
765 make_xmm( xmm_dst ),
766 make_xmm( xmm_src ) );
767 }
768
769 static void PIPE_CDECL
770 cos4f(
771 float *store )
772 {
773 store[0] = cosf( store[0] );
774 store[1] = cosf( store[1] );
775 store[2] = cosf( store[2] );
776 store[3] = cosf( store[3] );
777 }
778
779 static void
780 emit_cos(
781 struct x86_function *func,
782 unsigned xmm_save,
783 unsigned xmm_dst )
784 {
785 emit_func_call_dst(
786 func,
787 xmm_save,
788 xmm_dst,
789 cos4f );
790 }
791
792 static void PIPE_CDECL
793 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
794 __attribute__((force_align_arg_pointer))
795 #endif
796 ex24f(
797 float *store )
798 {
799 #if defined(PIPE_ARCH_SSE)
800 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
801 #else
802 store[0] = util_fast_exp2( store[0] );
803 store[1] = util_fast_exp2( store[1] );
804 store[2] = util_fast_exp2( store[2] );
805 store[3] = util_fast_exp2( store[3] );
806 #endif
807 }
808
809 static void
810 emit_ex2(
811 struct x86_function *func,
812 unsigned xmm_save,
813 unsigned xmm_dst )
814 {
815 emit_func_call_dst(
816 func,
817 xmm_save,
818 xmm_dst,
819 ex24f );
820 }
821
822 static void
823 emit_f2it(
824 struct x86_function *func,
825 unsigned xmm )
826 {
827 sse2_cvttps2dq(
828 func,
829 make_xmm( xmm ),
830 make_xmm( xmm ) );
831 }
832
833 static void
834 emit_i2f(
835 struct x86_function *func,
836 unsigned xmm )
837 {
838 sse2_cvtdq2ps(
839 func,
840 make_xmm( xmm ),
841 make_xmm( xmm ) );
842 }
843
844 static void PIPE_CDECL
845 flr4f(
846 float *store )
847 {
848 store[0] = floorf( store[0] );
849 store[1] = floorf( store[1] );
850 store[2] = floorf( store[2] );
851 store[3] = floorf( store[3] );
852 }
853
854 static void
855 emit_flr(
856 struct x86_function *func,
857 unsigned xmm_save,
858 unsigned xmm_dst )
859 {
860 emit_func_call_dst(
861 func,
862 xmm_save,
863 xmm_dst,
864 flr4f );
865 }
866
867 static void PIPE_CDECL
868 frc4f(
869 float *store )
870 {
871 store[0] -= floorf( store[0] );
872 store[1] -= floorf( store[1] );
873 store[2] -= floorf( store[2] );
874 store[3] -= floorf( store[3] );
875 }
876
877 static void
878 emit_frc(
879 struct x86_function *func,
880 unsigned xmm_save,
881 unsigned xmm_dst )
882 {
883 emit_func_call_dst(
884 func,
885 xmm_save,
886 xmm_dst,
887 frc4f );
888 }
889
890 static void PIPE_CDECL
891 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
892 __attribute__((force_align_arg_pointer))
893 #endif
894 lg24f(
895 float *store )
896 {
897 #if defined(PIPE_ARCH_SSE)
898 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
899 #else
900 store[0] = util_fast_log2( store[0] );
901 store[1] = util_fast_log2( store[1] );
902 store[2] = util_fast_log2( store[2] );
903 store[3] = util_fast_log2( store[3] );
904 #endif
905 }
906
907 static void
908 emit_lg2(
909 struct x86_function *func,
910 unsigned xmm_save,
911 unsigned xmm_dst )
912 {
913 emit_func_call_dst(
914 func,
915 xmm_save,
916 xmm_dst,
917 lg24f );
918 }
919
920 static void
921 emit_MOV(
922 struct x86_function *func,
923 unsigned xmm_dst,
924 unsigned xmm_src )
925 {
926 sse_movups(
927 func,
928 make_xmm( xmm_dst ),
929 make_xmm( xmm_src ) );
930 }
931
932 static void
933 emit_mul (struct x86_function *func,
934 unsigned xmm_dst,
935 unsigned xmm_src)
936 {
937 sse_mulps(
938 func,
939 make_xmm( xmm_dst ),
940 make_xmm( xmm_src ) );
941 }
942
943 static void
944 emit_neg(
945 struct x86_function *func,
946 unsigned xmm )
947 {
948 sse_xorps(
949 func,
950 make_xmm( xmm ),
951 get_temp(
952 TGSI_EXEC_TEMP_80000000_I,
953 TGSI_EXEC_TEMP_80000000_C ) );
954 }
955
956 static void PIPE_CDECL
957 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
958 __attribute__((force_align_arg_pointer))
959 #endif
960 pow4f(
961 float *store )
962 {
963 #if defined(PIPE_ARCH_SSE)
964 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
965 #else
966 store[0] = util_fast_pow( store[0], store[4] );
967 store[1] = util_fast_pow( store[1], store[5] );
968 store[2] = util_fast_pow( store[2], store[6] );
969 store[3] = util_fast_pow( store[3], store[7] );
970 #endif
971 }
972
973 static void
974 emit_pow(
975 struct x86_function *func,
976 unsigned xmm_save,
977 unsigned xmm_dst,
978 unsigned xmm_src )
979 {
980 emit_func_call_dst_src(
981 func,
982 xmm_save,
983 xmm_dst,
984 xmm_src,
985 pow4f );
986 }
987
988 static void
989 emit_rcp (
990 struct x86_function *func,
991 unsigned xmm_dst,
992 unsigned xmm_src )
993 {
994 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
995 * good enough. Need to either emit a proper divide or use the
996 * iterative technique described below in emit_rsqrt().
997 */
998 sse2_rcpps(
999 func,
1000 make_xmm( xmm_dst ),
1001 make_xmm( xmm_src ) );
1002 }
1003
1004 static void
1005 emit_rsqrt(
1006 struct x86_function *func,
1007 unsigned xmm_dst,
1008 unsigned xmm_src )
1009 {
1010 #if HIGH_PRECISION
1011 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1012 * implementations, their precision can be improved at fairly low
1013 * cost using a Newton-Raphson step, as below:
1014 *
1015 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1016 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1017 *
1018 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1019 */
1020 {
1021 struct x86_reg dst = make_xmm( xmm_dst );
1022 struct x86_reg src = make_xmm( xmm_src );
1023 struct x86_reg tmp0 = make_xmm( 2 );
1024 struct x86_reg tmp1 = make_xmm( 3 );
1025
1026 assert( xmm_dst != xmm_src );
1027 assert( xmm_dst != 2 && xmm_dst != 3 );
1028 assert( xmm_src != 2 && xmm_src != 3 );
1029
1030 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );    /* dst = 0.5 */
1031 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
1032 sse_rsqrtps( func, tmp1, src );  /* tmp1 = rsqrtps(a) */
1033 sse_mulps( func, src, tmp1 );    /* src = a * tmp1 */
1034 sse_mulps( func, dst, tmp1 );    /* dst = 0.5 * tmp1 */
1035 sse_mulps( func, src, tmp1 );    /* src = a * tmp1 * tmp1 */
1036 sse_subps( func, tmp0, src );    /* tmp0 = 3.0 - a * tmp1 * tmp1 */
1037 sse_mulps( func, dst, tmp0 );    /* dst = 0.5 * tmp1 * (3.0 - a * tmp1 * tmp1) */
1038 }
1039 #else
1040 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1041 * good enough.
1042 */
1043 sse_rsqrtps(
1044 func,
1045 make_xmm( xmm_dst ),
1046 make_xmm( xmm_src ) );
1047 #endif
1048 }
1049
1050 static void
1051 emit_setsign(
1052 struct x86_function *func,
1053 unsigned xmm )
1054 {
1055 sse_orps(
1056 func,
1057 make_xmm( xmm ),
1058 get_temp(
1059 TGSI_EXEC_TEMP_80000000_I,
1060 TGSI_EXEC_TEMP_80000000_C ) );
1061 }
1062
1063 static void PIPE_CDECL
1064 sin4f(
1065 float *store )
1066 {
1067 store[0] = sinf( store[0] );
1068 store[1] = sinf( store[1] );
1069 store[2] = sinf( store[2] );
1070 store[3] = sinf( store[3] );
1071 }
1072
1073 static void
1074 emit_sin (struct x86_function *func,
1075 unsigned xmm_save,
1076 unsigned xmm_dst)
1077 {
1078 emit_func_call_dst(
1079 func,
1080 xmm_save,
1081 xmm_dst,
1082 sin4f );
1083 }
1084
1085 static void
1086 emit_sub(
1087 struct x86_function *func,
1088 unsigned xmm_dst,
1089 unsigned xmm_src )
1090 {
1091 sse_subps(
1092 func,
1093 make_xmm( xmm_dst ),
1094 make_xmm( xmm_src ) );
1095 }
1096
1097 /**
1098 * Register fetch.
1099 */
1100
1101 static void
1102 emit_fetch(
1103 struct x86_function *func,
1104 unsigned xmm,
1105 const struct tgsi_full_src_register *reg,
1106 const unsigned chan_index )
1107 {
1108 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1109
1110 switch (swizzle) {
1111 case TGSI_EXTSWIZZLE_X:
1112 case TGSI_EXTSWIZZLE_Y:
1113 case TGSI_EXTSWIZZLE_Z:
1114 case TGSI_EXTSWIZZLE_W:
1115 switch (reg->SrcRegister.File) {
1116 case TGSI_FILE_CONSTANT:
1117 emit_const(
1118 func,
1119 xmm,
1120 reg->SrcRegister.Index,
1121 swizzle,
1122 reg->SrcRegister.Indirect,
1123 reg->SrcRegisterInd.File,
1124 reg->SrcRegisterInd.Index );
1125 break;
1126
1127 case TGSI_FILE_IMMEDIATE:
1128 emit_immediate(
1129 func,
1130 xmm,
1131 reg->SrcRegister.Index,
1132 swizzle );
1133 break;
1134
1135 case TGSI_FILE_INPUT:
1136 emit_inputf(
1137 func,
1138 xmm,
1139 reg->SrcRegister.Index,
1140 swizzle );
1141 break;
1142
1143 case TGSI_FILE_TEMPORARY:
1144 emit_tempf(
1145 func,
1146 xmm,
1147 reg->SrcRegister.Index,
1148 swizzle );
1149 break;
1150
1151 default:
1152 assert( 0 );
1153 }
1154 break;
1155
1156 case TGSI_EXTSWIZZLE_ZERO:
1157 emit_tempf(
1158 func,
1159 xmm,
1160 TGSI_EXEC_TEMP_00000000_I,
1161 TGSI_EXEC_TEMP_00000000_C );
1162 break;
1163
1164 case TGSI_EXTSWIZZLE_ONE:
1165 emit_tempf(
1166 func,
1167 xmm,
1168 TEMP_ONE_I,
1169 TEMP_ONE_C );
1170 break;
1171
1172 default:
1173 assert( 0 );
1174 }
1175
1176 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1177 case TGSI_UTIL_SIGN_CLEAR:
1178 emit_abs( func, xmm );
1179 break;
1180
1181 case TGSI_UTIL_SIGN_SET:
1182 emit_setsign( func, xmm );
1183 break;
1184
1185 case TGSI_UTIL_SIGN_TOGGLE:
1186 emit_neg( func, xmm );
1187 break;
1188
1189 case TGSI_UTIL_SIGN_KEEP:
1190 break;
1191 }
1192 }
1193
1194 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1195 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1196
1197 /**
1198 * Register store.
1199 */
1200
1201 static void
1202 emit_store(
1203 struct x86_function *func,
1204 unsigned xmm,
1205 const struct tgsi_full_dst_register *reg,
1206 const struct tgsi_full_instruction *inst,
1207 unsigned chan_index )
1208 {
1209 switch( reg->DstRegister.File ) {
1210 case TGSI_FILE_OUTPUT:
1211 emit_output(
1212 func,
1213 xmm,
1214 reg->DstRegister.Index,
1215 chan_index );
1216 break;
1217
1218 case TGSI_FILE_TEMPORARY:
1219 emit_temps(
1220 func,
1221 xmm,
1222 reg->DstRegister.Index,
1223 chan_index );
1224 break;
1225
1226 case TGSI_FILE_ADDRESS:
1227 emit_addrs(
1228 func,
1229 xmm,
1230 reg->DstRegister.Index,
1231 chan_index );
1232 break;
1233
1234 default:
1235 assert( 0 );
1236 }
1237
1238 switch( inst->Instruction.Saturate ) {
1239 case TGSI_SAT_NONE:
1240 break;
1241
1242 case TGSI_SAT_ZERO_ONE:
1243 /* XXX: TGSI_SAT_ZERO_ONE saturation not implemented */
1244 break;
1245
1246 case TGSI_SAT_MINUS_PLUS_ONE:
1247 assert( 0 );
1248 break;
1249 }
1250 }
1251
1252 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1253 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1254
1255 /**
1256 * High-level instruction translators.
1257 */
1258
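/**
 * Emit code for TGSI_OPCODE_KIL: test each unique source component
 * against zero and OR the resulting mask bits into the kill mask.
 */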
1259 static void
1260 emit_kil(
1261 struct x86_function *func,
1262 const struct tgsi_full_src_register *reg )
1263 {
1264 unsigned uniquemask;
1265 unsigned registers[4];
1266 unsigned nextregister = 0;
1267 unsigned firstchan = ~0;
1268 unsigned chan_index;
1269
1270 /* This mask stores component bits that have already been tested. Note that
1271 * we test whether the value is less than zero, so the constant ZERO and ONE
1272 * swizzles need not be tested. */
1273 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1274
1275 FOR_EACH_CHANNEL( chan_index ) {
1276 unsigned swizzle;
1277
1278 /* unswizzle channel */
1279 swizzle = tgsi_util_get_full_src_register_extswizzle(
1280 reg,
1281 chan_index );
1282
1283 /* check if the component has not been already tested */
1284 if( !(uniquemask & (1 << swizzle)) ) {
1285 uniquemask |= 1 << swizzle;
1286
1287 /* allocate register */
1288 registers[chan_index] = nextregister;
1289 emit_fetch(
1290 func,
1291 nextregister,
1292 reg,
1293 chan_index );
1294 nextregister++;
1295
1296 /* mark the first channel used */
1297 if( firstchan == ~0 ) {
1298 firstchan = chan_index;
1299 }
1300 }
1301 }
1302
1303 x86_push(
1304 func,
1305 x86_make_reg( file_REG32, reg_AX ) );
1306 x86_push(
1307 func,
1308 x86_make_reg( file_REG32, reg_DX ) );
1309
1310 FOR_EACH_CHANNEL( chan_index ) {
1311 if( uniquemask & (1 << chan_index) ) {
1312 sse_cmpps(
1313 func,
1314 make_xmm( registers[chan_index] ),
1315 get_temp(
1316 TGSI_EXEC_TEMP_00000000_I,
1317 TGSI_EXEC_TEMP_00000000_C ),
1318 cc_LessThan );
1319
1320 if( chan_index == firstchan ) {
1321 sse_pmovmskb(
1322 func,
1323 x86_make_reg( file_REG32, reg_AX ),
1324 make_xmm( registers[chan_index] ) );
1325 }
1326 else {
1327 sse_pmovmskb(
1328 func,
1329 x86_make_reg( file_REG32, reg_DX ),
1330 make_xmm( registers[chan_index] ) );
1331 x86_or(
1332 func,
1333 x86_make_reg( file_REG32, reg_AX ),
1334 x86_make_reg( file_REG32, reg_DX ) );
1335 }
1336 }
1337 }
1338
1339 x86_or(
1340 func,
1341 get_temp(
1342 TGSI_EXEC_TEMP_KILMASK_I,
1343 TGSI_EXEC_TEMP_KILMASK_C ),
1344 x86_make_reg( file_REG32, reg_AX ) );
1345
1346 x86_pop(
1347 func,
1348 x86_make_reg( file_REG32, reg_DX ) );
1349 x86_pop(
1350 func,
1351 x86_make_reg( file_REG32, reg_AX ) );
1352 }
1353
1354
1355 static void
1356 emit_kilp(
1357 struct x86_function *func )
1358 {
1359 /* XXX todo / fix me */
1360 }
1361
1362
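/**
 * Emit a per-channel comparison: dst = (src0 cc src1) ? 1.0 : 0.0,
 * implemented as cmpps followed by andps against the constant 1.0.
 */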
1363 static void
1364 emit_setcc(
1365 struct x86_function *func,
1366 struct tgsi_full_instruction *inst,
1367 enum sse_cc cc )
1368 {
1369 unsigned chan_index;
1370
1371 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1372 FETCH( func, *inst, 0, 0, chan_index );
1373 FETCH( func, *inst, 1, 1, chan_index );
1374 sse_cmpps(
1375 func,
1376 make_xmm( 0 ),
1377 make_xmm( 1 ),
1378 cc );
1379 sse_andps(
1380 func,
1381 make_xmm( 0 ),
1382 get_temp(
1383 TEMP_ONE_I,
1384 TEMP_ONE_C ) );
1385 STORE( func, *inst, 0, 0, chan_index );
1386 }
1387 }
1388
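/**
 * Emit TGSI_OPCODE_CMP: dst = (src0 < 0.0) ? src1 : src2, built from a
 * cmpps mask and the andps/andnps/orps select sequence.
 */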
1389 static void
1390 emit_cmp(
1391 struct x86_function *func,
1392 struct tgsi_full_instruction *inst )
1393 {
1394 unsigned chan_index;
1395
1396 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1397 FETCH( func, *inst, 0, 0, chan_index );
1398 FETCH( func, *inst, 1, 1, chan_index );
1399 FETCH( func, *inst, 2, 2, chan_index );
1400 sse_cmpps(
1401 func,
1402 make_xmm( 0 ),
1403 get_temp(
1404 TGSI_EXEC_TEMP_00000000_I,
1405 TGSI_EXEC_TEMP_00000000_C ),
1406 cc_LessThan );
1407 sse_andps(
1408 func,
1409 make_xmm( 1 ),
1410 make_xmm( 0 ) );
1411 sse_andnps(
1412 func,
1413 make_xmm( 0 ),
1414 make_xmm( 2 ) );
1415 sse_orps(
1416 func,
1417 make_xmm( 0 ),
1418 make_xmm( 1 ) );
1419 STORE( func, *inst, 0, 0, chan_index );
1420 }
1421 }
1422
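/**
 * Translate one TGSI instruction to x86/SSE code.
 * \return 1 on success, 0 if the opcode is not (yet) supported.
 */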
1423 static int
1424 emit_instruction(
1425 struct x86_function *func,
1426 struct tgsi_full_instruction *inst )
1427 {
1428 unsigned chan_index;
1429
1430 switch (inst->Instruction.Opcode) {
1431 case TGSI_OPCODE_ARL:
1432 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1433 FETCH( func, *inst, 0, 0, chan_index );
1434 emit_f2it( func, 0 );
1435 STORE( func, *inst, 0, 0, chan_index );
1436 }
1437 break;
1438
1439 case TGSI_OPCODE_MOV:
1440 case TGSI_OPCODE_SWZ:
1441 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1442 FETCH( func, *inst, 0, 0, chan_index );
1443 STORE( func, *inst, 0, 0, chan_index );
1444 }
1445 break;
1446
1447 case TGSI_OPCODE_LIT:
1448 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1449 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1450 emit_tempf(
1451 func,
1452 0,
1453 TEMP_ONE_I,
1454 TEMP_ONE_C);
1455 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1456 STORE( func, *inst, 0, 0, CHAN_X );
1457 }
1458 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1459 STORE( func, *inst, 0, 0, CHAN_W );
1460 }
1461 }
1462 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1463 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1464 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1465 FETCH( func, *inst, 0, 0, CHAN_X );
1466 sse_maxps(
1467 func,
1468 make_xmm( 0 ),
1469 get_temp(
1470 TGSI_EXEC_TEMP_00000000_I,
1471 TGSI_EXEC_TEMP_00000000_C ) );
1472 STORE( func, *inst, 0, 0, CHAN_Y );
1473 }
1474 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1475 /* XMM[1] = SrcReg[0].yyyy */
1476 FETCH( func, *inst, 1, 0, CHAN_Y );
1477 /* XMM[1] = max(XMM[1], 0) */
1478 sse_maxps(
1479 func,
1480 make_xmm( 1 ),
1481 get_temp(
1482 TGSI_EXEC_TEMP_00000000_I,
1483 TGSI_EXEC_TEMP_00000000_C ) );
1484 /* XMM[2] = SrcReg[0].wwww */
1485 FETCH( func, *inst, 2, 0, CHAN_W );
1486 /* XMM[2] = min(XMM[2], 128.0) */
1487 sse_minps(
1488 func,
1489 make_xmm( 2 ),
1490 get_temp(
1491 TGSI_EXEC_TEMP_128_I,
1492 TGSI_EXEC_TEMP_128_C ) );
1493 /* XMM[2] = max(XMM[2], -128.0) */
1494 sse_maxps(
1495 func,
1496 make_xmm( 2 ),
1497 get_temp(
1498 TGSI_EXEC_TEMP_MINUS_128_I,
1499 TGSI_EXEC_TEMP_MINUS_128_C ) );
1500 emit_pow( func, 3, 1, 2 );
1501 FETCH( func, *inst, 0, 0, CHAN_X );
1502 sse_xorps(
1503 func,
1504 make_xmm( 2 ),
1505 make_xmm( 2 ) );
1506 sse_cmpps(
1507 func,
1508 make_xmm( 2 ),
1509 make_xmm( 0 ),
1510 cc_LessThanEqual );
1511 sse_andps(
1512 func,
1513 make_xmm( 2 ),
1514 make_xmm( 1 ) );
1515 STORE( func, *inst, 2, 0, CHAN_Z );
1516 }
1517 }
1518 break;
1519
1520 case TGSI_OPCODE_RCP:
1521 /* TGSI_OPCODE_RECIP */
1522 FETCH( func, *inst, 0, 0, CHAN_X );
1523 emit_rcp( func, 0, 0 );
1524 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1525 STORE( func, *inst, 0, 0, chan_index );
1526 }
1527 break;
1528
1529 case TGSI_OPCODE_RSQ:
1530 /* TGSI_OPCODE_RECIPSQRT */
1531 FETCH( func, *inst, 0, 0, CHAN_X );
1532 emit_rsqrt( func, 1, 0 );
1533 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1534 STORE( func, *inst, 1, 0, chan_index );
1535 }
1536 break;
1537
1538 case TGSI_OPCODE_EXP:
1539 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1540 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1541 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1542 FETCH( func, *inst, 0, 0, CHAN_X );
1543 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1544 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1545 emit_MOV( func, 1, 0 );
1546 emit_flr( func, 2, 1 );
1547 /* dst.x = ex2(floor(src.x)) */
1548 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1549 emit_MOV( func, 2, 1 );
1550 emit_ex2( func, 3, 2 );
1551 STORE( func, *inst, 2, 0, CHAN_X );
1552 }
1553 /* dst.y = src.x - floor(src.x) */
1554 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1555 emit_MOV( func, 2, 0 );
1556 emit_sub( func, 2, 1 );
1557 STORE( func, *inst, 2, 0, CHAN_Y );
1558 }
1559 }
1560 /* dst.z = ex2(src.x) */
1561 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1562 emit_ex2( func, 3, 0 );
1563 STORE( func, *inst, 0, 0, CHAN_Z );
1564 }
1565 }
1566 /* dst.w = 1.0 */
1567 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1568 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1569 STORE( func, *inst, 0, 0, CHAN_W );
1570 }
1571 break;
1572
1573 case TGSI_OPCODE_LOG:
1574 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1575 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1576 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1577 FETCH( func, *inst, 0, 0, CHAN_X );
1578 emit_abs( func, 0 );
1579 emit_MOV( func, 1, 0 );
1580 emit_lg2( func, 2, 1 );
1581 /* dst.z = lg2(abs(src.x)) */
1582 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1583 STORE( func, *inst, 1, 0, CHAN_Z );
1584 }
1585 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1586 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1587 emit_flr( func, 2, 1 );
1588 /* dst.x = floor(lg2(abs(src.x))) */
1589 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1590 STORE( func, *inst, 1, 0, CHAN_X );
1591 }
1592 /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1593 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1594 emit_ex2( func, 2, 1 );
1595 emit_rcp( func, 1, 1 );
1596 emit_mul( func, 0, 1 );
1597 STORE( func, *inst, 0, 0, CHAN_Y );
1598 }
1599 }
1600 }
1601 /* dst.w = 1.0 */
1602 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1603 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1604 STORE( func, *inst, 0, 0, CHAN_W );
1605 }
1606 break;
1607
1608 case TGSI_OPCODE_MUL:
1609 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1610 FETCH( func, *inst, 0, 0, chan_index );
1611 FETCH( func, *inst, 1, 1, chan_index );
1612 emit_mul( func, 0, 1 );
1613 STORE( func, *inst, 0, 0, chan_index );
1614 }
1615 break;
1616
1617 case TGSI_OPCODE_ADD:
1618 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1619 FETCH( func, *inst, 0, 0, chan_index );
1620 FETCH( func, *inst, 1, 1, chan_index );
1621 emit_add( func, 0, 1 );
1622 STORE( func, *inst, 0, 0, chan_index );
1623 }
1624 break;
1625
1626 case TGSI_OPCODE_DP3:
1627 /* TGSI_OPCODE_DOT3 */
1628 FETCH( func, *inst, 0, 0, CHAN_X );
1629 FETCH( func, *inst, 1, 1, CHAN_X );
1630 emit_mul( func, 0, 1 );
1631 FETCH( func, *inst, 1, 0, CHAN_Y );
1632 FETCH( func, *inst, 2, 1, CHAN_Y );
1633 emit_mul( func, 1, 2 );
1634 emit_add( func, 0, 1 );
1635 FETCH( func, *inst, 1, 0, CHAN_Z );
1636 FETCH( func, *inst, 2, 1, CHAN_Z );
1637 emit_mul( func, 1, 2 );
1638 emit_add( func, 0, 1 );
1639 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1640 STORE( func, *inst, 0, 0, chan_index );
1641 }
1642 break;
1643
1644 case TGSI_OPCODE_DP4:
1645 /* TGSI_OPCODE_DOT4 */
1646 FETCH( func, *inst, 0, 0, CHAN_X );
1647 FETCH( func, *inst, 1, 1, CHAN_X );
1648 emit_mul( func, 0, 1 );
1649 FETCH( func, *inst, 1, 0, CHAN_Y );
1650 FETCH( func, *inst, 2, 1, CHAN_Y );
1651 emit_mul( func, 1, 2 );
1652 emit_add( func, 0, 1 );
1653 FETCH( func, *inst, 1, 0, CHAN_Z );
1654 FETCH( func, *inst, 2, 1, CHAN_Z );
1655 emit_mul(func, 1, 2 );
1656 emit_add(func, 0, 1 );
1657 FETCH( func, *inst, 1, 0, CHAN_W );
1658 FETCH( func, *inst, 2, 1, CHAN_W );
1659 emit_mul( func, 1, 2 );
1660 emit_add( func, 0, 1 );
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1662 STORE( func, *inst, 0, 0, chan_index );
1663 }
1664 break;
1665
1666 case TGSI_OPCODE_DST:
1667 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1668 emit_tempf(
1669 func,
1670 0,
1671 TEMP_ONE_I,
1672 TEMP_ONE_C );
1673 STORE( func, *inst, 0, 0, CHAN_X );
1674 }
1675 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1676 FETCH( func, *inst, 0, 0, CHAN_Y );
1677 FETCH( func, *inst, 1, 1, CHAN_Y );
1678 emit_mul( func, 0, 1 );
1679 STORE( func, *inst, 0, 0, CHAN_Y );
1680 }
1681 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1682 FETCH( func, *inst, 0, 0, CHAN_Z );
1683 STORE( func, *inst, 0, 0, CHAN_Z );
1684 }
1685 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1686 FETCH( func, *inst, 0, 1, CHAN_W );
1687 STORE( func, *inst, 0, 0, CHAN_W );
1688 }
1689 break;
1690
1691 case TGSI_OPCODE_MIN:
1692 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1693 FETCH( func, *inst, 0, 0, chan_index );
1694 FETCH( func, *inst, 1, 1, chan_index );
1695 sse_minps(
1696 func,
1697 make_xmm( 0 ),
1698 make_xmm( 1 ) );
1699 STORE( func, *inst, 0, 0, chan_index );
1700 }
1701 break;
1702
1703 case TGSI_OPCODE_MAX:
1704 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1705 FETCH( func, *inst, 0, 0, chan_index );
1706 FETCH( func, *inst, 1, 1, chan_index );
1707 sse_maxps(
1708 func,
1709 make_xmm( 0 ),
1710 make_xmm( 1 ) );
1711 STORE( func, *inst, 0, 0, chan_index );
1712 }
1713 break;
1714
1715 case TGSI_OPCODE_SLT:
1716 /* TGSI_OPCODE_SETLT */
1717 emit_setcc( func, inst, cc_LessThan );
1718 break;
1719
1720 case TGSI_OPCODE_SGE:
1721 /* TGSI_OPCODE_SETGE */
1722 emit_setcc( func, inst, cc_NotLessThan );
1723 break;
1724
1725 case TGSI_OPCODE_MAD:
1726 /* TGSI_OPCODE_MADD */
1727 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1728 FETCH( func, *inst, 0, 0, chan_index );
1729 FETCH( func, *inst, 1, 1, chan_index );
1730 FETCH( func, *inst, 2, 2, chan_index );
1731 emit_mul( func, 0, 1 );
1732 emit_add( func, 0, 2 );
1733 STORE( func, *inst, 0, 0, chan_index );
1734 }
1735 break;
1736
1737 case TGSI_OPCODE_SUB:
1738 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1739 FETCH( func, *inst, 0, 0, chan_index );
1740 FETCH( func, *inst, 1, 1, chan_index );
1741 emit_sub( func, 0, 1 );
1742 STORE( func, *inst, 0, 0, chan_index );
1743 }
1744 break;
1745
1746 case TGSI_OPCODE_LERP:
1747 /* TGSI_OPCODE_LRP */
1748 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1749 FETCH( func, *inst, 0, 0, chan_index );
1750 FETCH( func, *inst, 1, 1, chan_index );
1751 FETCH( func, *inst, 2, 2, chan_index );
1752 emit_sub( func, 1, 2 );
1753 emit_mul( func, 0, 1 );
1754 emit_add( func, 0, 2 );
1755 STORE( func, *inst, 0, 0, chan_index );
1756 }
1757 break;
1758
1759 case TGSI_OPCODE_CND:
1760 return 0;
1761 break;
1762
1763 case TGSI_OPCODE_CND0:
1764 return 0;
1765 break;
1766
1767 case TGSI_OPCODE_DOT2ADD:
1768 /* TGSI_OPCODE_DP2A */
1769 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
1770 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
1771 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1772 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
1773 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
1774 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1775 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1776 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
1777 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1778 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1779 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_INDEX:
1784 return 0;
1785 break;
1786
1787 case TGSI_OPCODE_NEGATE:
1788 return 0;
1789 break;
1790
1791 case TGSI_OPCODE_FRAC:
1792 /* TGSI_OPCODE_FRC */
1793 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1794 FETCH( func, *inst, 0, 0, chan_index );
1795 emit_frc( func, 0, 0 );
1796 STORE( func, *inst, 0, 0, chan_index );
1797 }
1798 break;
1799
1800 case TGSI_OPCODE_CLAMP:
1801 return 0;
1802 break;
1803
1804 case TGSI_OPCODE_FLOOR:
1805 /* TGSI_OPCODE_FLR */
1806 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1807 FETCH( func, *inst, 0, 0, chan_index );
1808 emit_flr( func, 0, 0 );
1809 STORE( func, *inst, 0, 0, chan_index );
1810 }
1811 break;
1812
1813 case TGSI_OPCODE_ROUND:
1814 return 0;
1815 break;
1816
1817 case TGSI_OPCODE_EXPBASE2:
1818 /* TGSI_OPCODE_EX2 */
1819 FETCH( func, *inst, 0, 0, CHAN_X );
1820 emit_ex2( func, 0, 0 );
1821 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1822 STORE( func, *inst, 0, 0, chan_index );
1823 }
1824 break;
1825
1826 case TGSI_OPCODE_LOGBASE2:
1827 /* TGSI_OPCODE_LG2 */
1828 FETCH( func, *inst, 0, 0, CHAN_X );
1829 emit_lg2( func, 0, 0 );
1830 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1831 STORE( func, *inst, 0, 0, chan_index );
1832 }
1833 break;
1834
1835 case TGSI_OPCODE_POWER:
1836 /* TGSI_OPCODE_POW */
1837 FETCH( func, *inst, 0, 0, CHAN_X );
1838 FETCH( func, *inst, 1, 1, CHAN_X );
1839 emit_pow( func, 0, 0, 1 );
1840 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1841 STORE( func, *inst, 0, 0, chan_index );
1842 }
1843 break;
1844
1845 case TGSI_OPCODE_CROSSPRODUCT:
1846 /* TGSI_OPCODE_XPD */
1847 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1848 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1849 FETCH( func, *inst, 1, 1, CHAN_Z );
1850 FETCH( func, *inst, 3, 0, CHAN_Z );
1851 }
1852 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1853 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1854 FETCH( func, *inst, 0, 0, CHAN_Y );
1855 FETCH( func, *inst, 4, 1, CHAN_Y );
1856 }
1857 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1858 emit_MOV( func, 2, 0 );
1859 emit_mul( func, 2, 1 );
1860 emit_MOV( func, 5, 3 );
1861 emit_mul( func, 5, 4 );
1862 emit_sub( func, 2, 5 );
1863 STORE( func, *inst, 2, 0, CHAN_X );
1864 }
1865 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1866 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1867 FETCH( func, *inst, 2, 1, CHAN_X );
1868 FETCH( func, *inst, 5, 0, CHAN_X );
1869 }
1870 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1871 emit_mul( func, 3, 2 );
1872 emit_mul( func, 1, 5 );
1873 emit_sub( func, 3, 1 );
1874 STORE( func, *inst, 3, 0, CHAN_Y );
1875 }
1876 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1877 emit_mul( func, 5, 4 );
1878 emit_mul( func, 0, 2 );
1879 emit_sub( func, 5, 0 );
1880 STORE( func, *inst, 5, 0, CHAN_Z );
1881 }
1882 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1883 emit_tempf(
1884 func,
1885 0,
1886 TEMP_ONE_I,
1887 TEMP_ONE_C );
1888 STORE( func, *inst, 0, 0, CHAN_W );
1889 }
1890 break;
1891
1892 case TGSI_OPCODE_MULTIPLYMATRIX:
1893 return 0;
1894 break;
1895
1896 case TGSI_OPCODE_ABS:
1897 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1898 FETCH( func, *inst, 0, 0, chan_index );
1899 emit_abs( func, 0 );
1900
1901 STORE( func, *inst, 0, 0, chan_index );
1902 }
1903 break;
1904
1905 case TGSI_OPCODE_RCC:
1906 return 0;
1907 break;
1908
1909 case TGSI_OPCODE_DPH:
1910 FETCH( func, *inst, 0, 0, CHAN_X );
1911 FETCH( func, *inst, 1, 1, CHAN_X );
1912 emit_mul( func, 0, 1 );
1913 FETCH( func, *inst, 1, 0, CHAN_Y );
1914 FETCH( func, *inst, 2, 1, CHAN_Y );
1915 emit_mul( func, 1, 2 );
1916 emit_add( func, 0, 1 );
1917 FETCH( func, *inst, 1, 0, CHAN_Z );
1918 FETCH( func, *inst, 2, 1, CHAN_Z );
1919 emit_mul( func, 1, 2 );
1920 emit_add( func, 0, 1 );
1921 FETCH( func, *inst, 1, 1, CHAN_W );
1922 emit_add( func, 0, 1 );
1923 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1924 STORE( func, *inst, 0, 0, chan_index );
1925 }
1926 break;
1927
1928 case TGSI_OPCODE_COS:
1929 FETCH( func, *inst, 0, 0, CHAN_X );
1930 emit_cos( func, 0, 0 );
1931 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1932 STORE( func, *inst, 0, 0, chan_index );
1933 }
1934 break;
1935
1936 case TGSI_OPCODE_DDX:
1937 return 0;
1938 break;
1939
1940 case TGSI_OPCODE_DDY:
1941 return 0;
1942 break;
1943
1944 case TGSI_OPCODE_KILP:
1945 /* predicated kill */
1946 emit_kilp( func );
1947 return 0; /* XXX fix me */
1948 break;
1949
1950 case TGSI_OPCODE_KIL:
1951 /* conditional kill */
1952 emit_kil( func, &inst->FullSrcRegisters[0] );
1953 break;
1954
1955 case TGSI_OPCODE_PK2H:
1956 return 0;
1957 break;
1958
1959 case TGSI_OPCODE_PK2US:
1960 return 0;
1961 break;
1962
1963 case TGSI_OPCODE_PK4B:
1964 return 0;
1965 break;
1966
1967 case TGSI_OPCODE_PK4UB:
1968 return 0;
1969 break;
1970
1971 case TGSI_OPCODE_RFL:
1972 return 0;
1973 break;
1974
1975 case TGSI_OPCODE_SEQ:
1976 return 0;
1977 break;
1978
1979 case TGSI_OPCODE_SFL:
1980 return 0;
1981 break;
1982
1983 case TGSI_OPCODE_SGT:
1984 return 0;
1985 break;
1986
1987 case TGSI_OPCODE_SIN:
1988 FETCH( func, *inst, 0, 0, CHAN_X );
1989 emit_sin( func, 0, 0 );
1990 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1991 STORE( func, *inst, 0, 0, chan_index );
1992 }
1993 break;
1994
1995 case TGSI_OPCODE_SLE:
1996 return 0;
1997 break;
1998
1999 case TGSI_OPCODE_SNE:
2000 return 0;
2001 break;
2002
2003 case TGSI_OPCODE_STR:
2004 return 0;
2005 break;
2006
2007 case TGSI_OPCODE_TEX:
2008 if (0) {
2009 /* Disable dummy texture code:
2010 */
2011 emit_tempf(
2012 func,
2013 0,
2014 TEMP_ONE_I,
2015 TEMP_ONE_C );
2016 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2017 STORE( func, *inst, 0, 0, chan_index );
2018 }
2019 }
2020 else {
2021 return 0;
2022 }
2023 break;
2024
2025 case TGSI_OPCODE_TXD:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_UP2H:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_UP2US:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_UP4B:
2038 return 0;
2039 break;
2040
2041 case TGSI_OPCODE_UP4UB:
2042 return 0;
2043 break;
2044
2045 case TGSI_OPCODE_X2D:
2046 return 0;
2047 break;
2048
2049 case TGSI_OPCODE_ARA:
2050 return 0;
2051 break;
2052
2053 case TGSI_OPCODE_ARR:
2054 return 0;
2055 break;
2056
2057 case TGSI_OPCODE_BRA:
2058 return 0;
2059 break;
2060
2061 case TGSI_OPCODE_CAL:
2062 return 0;
2063 break;
2064
2065 case TGSI_OPCODE_RET:
2066 emit_ret( func );
2067 break;
2068
2069 case TGSI_OPCODE_END:
2070 break;
2071
2072 case TGSI_OPCODE_SSG:
2073 return 0;
2074 break;
2075
2076 case TGSI_OPCODE_CMP:
2077 emit_cmp (func, inst);
2078 break;
2079
2080 case TGSI_OPCODE_SCS:
2081 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2082 FETCH( func, *inst, 0, 0, CHAN_X );
2083 emit_cos( func, 0, 0 );
2084 STORE( func, *inst, 0, 0, CHAN_X );
2085 }
2086 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2087 FETCH( func, *inst, 0, 0, CHAN_X );
2088 emit_sin( func, 0, 0 );
2089 STORE( func, *inst, 0, 0, CHAN_Y );
2090 }
2091 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2092 emit_tempf(
2093 func,
2094 0,
2095 TGSI_EXEC_TEMP_00000000_I,
2096 TGSI_EXEC_TEMP_00000000_C );
2097 STORE( func, *inst, 0, 0, CHAN_Z );
2098 }
2099 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2100 emit_tempf(
2101 func,
2102 0,
2103 TEMP_ONE_I,
2104 TEMP_ONE_C );
2105 STORE( func, *inst, 0, 0, CHAN_W );
2106 }
2107 break;
2108
2109 case TGSI_OPCODE_TXB:
2110 return 0;
2111 break;
2112
2113 case TGSI_OPCODE_NRM:
2114 /* fall-through */
2115 case TGSI_OPCODE_NRM4:
2116 /* 3 or 4-component normalization */
2117 {
2118 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2119 /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
2120 FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */
2121 FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */
2122 FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */
2123 if (dims == 4) {
2124 FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
2125 }
2126 emit_MOV( func, 0, 4 );    /* xmm0 = xmm4 */
2127 emit_mul( func, 0, 4 );    /* xmm0 *= xmm4 */
2128 emit_MOV( func, 1, 5 );    /* xmm1 = xmm5 */
2129 emit_mul( func, 1, 5 );    /* xmm1 *= xmm5 */
2130 emit_add( func, 0, 1 );    /* xmm0 += xmm1 */
2131 emit_MOV( func, 1, 6 );    /* xmm1 = xmm6 */
2132 emit_mul( func, 1, 6 );    /* xmm1 *= xmm6 */
2133 emit_add( func, 0, 1 );    /* xmm0 += xmm1 */
2134 if (dims == 4) {
2135 emit_MOV( func, 1, 7 );    /* xmm1 = xmm7 */
2136 emit_mul( func, 1, 7 );    /* xmm1 *= xmm7 */
2137 emit_add( func, 0, 1 );    /* xmm0 += xmm1 */
2138 }
2139 emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */
2140 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2141 if (chan_index < dims) {
2142 emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
2143 STORE( func, *inst, 4+chan_index, 0, chan_index );
2144 }
2145 }
2146 }
2147 break;
2148
2149 case TGSI_OPCODE_DIV:
2150 return 0;
2151 break;
2152
2153 case TGSI_OPCODE_DP2:
2154 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2155 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2156 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2157 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2158 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2159 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2160 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2161 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2162 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2163 }
2164 break;
2165
2166 case TGSI_OPCODE_TXL:
2167 return 0;
2168 break;
2169
2170 case TGSI_OPCODE_BRK:
2171 return 0;
2172 break;
2173
2174 case TGSI_OPCODE_IF:
2175 return 0;
2176 break;
2177
2178 case TGSI_OPCODE_LOOP:
2179 return 0;
2180 break;
2181
2182 case TGSI_OPCODE_REP:
2183 return 0;
2184 break;
2185
2186 case TGSI_OPCODE_ELSE:
2187 return 0;
2188 break;
2189
2190 case TGSI_OPCODE_ENDIF:
2191 return 0;
2192 break;
2193
2194 case TGSI_OPCODE_ENDLOOP:
2195 return 0;
2196 break;
2197
2198 case TGSI_OPCODE_ENDREP:
2199 return 0;
2200 break;
2201
2202 case TGSI_OPCODE_PUSHA:
2203 return 0;
2204 break;
2205
2206 case TGSI_OPCODE_POPA:
2207 return 0;
2208 break;
2209
2210 case TGSI_OPCODE_CEIL:
2211 return 0;
2212 break;
2213
2214 case TGSI_OPCODE_I2F:
2215 return 0;
2216 break;
2217
2218 case TGSI_OPCODE_NOT:
2219 return 0;
2220 break;
2221
2222 case TGSI_OPCODE_TRUNC:
2223 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2224 FETCH( func, *inst, 0, 0, chan_index );
2225 emit_f2it( func, 0 );
2226 emit_i2f( func, 0 );
2227 STORE( func, *inst, 0, 0, chan_index );
2228 }
2229 break;
2230
2231 case TGSI_OPCODE_SHL:
2232 return 0;
2233 break;
2234
2235 case TGSI_OPCODE_SHR:
2236 return 0;
2237 break;
2238
2239 case TGSI_OPCODE_AND:
2240 return 0;
2241 break;
2242
2243 case TGSI_OPCODE_OR:
2244 return 0;
2245 break;
2246
2247 case TGSI_OPCODE_MOD:
2248 return 0;
2249 break;
2250
2251 case TGSI_OPCODE_XOR:
2252 return 0;
2253 break;
2254
2255 case TGSI_OPCODE_SAD:
2256 return 0;
2257 break;
2258
2259 case TGSI_OPCODE_TXF:
2260 return 0;
2261 break;
2262
2263 case TGSI_OPCODE_TXQ:
2264 return 0;
2265 break;
2266
2267 case TGSI_OPCODE_CONT:
2268 return 0;
2269 break;
2270
2271 case TGSI_OPCODE_EMIT:
2272 return 0;
2273 break;
2274
2275 case TGSI_OPCODE_ENDPRIM:
2276 return 0;
2277 break;
2278
2279 default:
2280 return 0;
2281 }
2282
2283 return 1;
2284 }
2285
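/**
 * Emit interpolation code for a fragment shader input declaration,
 * handling constant, linear and perspective interpolation from the
 * a0/dadx/dady coefficients.
 */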
2286 static void
2287 emit_declaration(
2288 struct x86_function *func,
2289 struct tgsi_full_declaration *decl )
2290 {
2291 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2292 unsigned first, last, mask;
2293 unsigned i, j;
2294
2295 first = decl->DeclarationRange.First;
2296 last = decl->DeclarationRange.Last;
2297 mask = decl->Declaration.UsageMask;
2298
2299 for( i = first; i <= last; i++ ) {
2300 for( j = 0; j < NUM_CHANNELS; j++ ) {
2301 if( mask & (1 << j) ) {
2302 switch( decl->Declaration.Interpolate ) {
2303 case TGSI_INTERPOLATE_CONSTANT:
2304 emit_coef_a0( func, 0, i, j );
2305 emit_inputs( func, 0, i, j );
2306 break;
2307
2308 case TGSI_INTERPOLATE_LINEAR:
2309 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2310 emit_coef_dadx( func, 1, i, j );
2311 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2312 emit_coef_dady( func, 3, i, j );
2313 emit_mul( func, 0, 1 ); /* x * dadx */
2314 emit_coef_a0( func, 4, i, j );
2315 emit_mul( func, 2, 3 ); /* y * dady */
2316 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2317 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2318 emit_inputs( func, 0, i, j );
2319 break;
2320
2321 case TGSI_INTERPOLATE_PERSPECTIVE:
2322 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2323 emit_coef_dadx( func, 1, i, j );
2324 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2325 emit_coef_dady( func, 3, i, j );
2326 emit_mul( func, 0, 1 ); /* x * dadx */
2327 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2328 emit_coef_a0( func, 5, i, j );
2329 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2330 emit_mul( func, 2, 3 ); /* y * dady */
2331 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2332 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2333 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2334 emit_inputs( func, 0, i, j );
2335 break;
2336
2337 default:
2338 assert( 0 );
2339 break;
2340 }
2341 }
2342 }
2343 }
2344 }
2345 }
2346
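/**
 * Emit code to convert four vertices of AOS input data to the SOA layout
 * the generated shader expects: a 4x4 float transpose built from
 * movlps/movhps loads and shufps.
 */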
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_soa,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Gather the xy/zw halves of four vertices:
       * xmm0 = x0 y0 x1 y1,  xmm3 = z0 w0 z1 w1
       * xmm1 = x2 y2 x3 y3,  xmm4 = z2 w2 z3 w3
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Transpose into one vector per channel; shuffle mask 0x88 selects
       * the even lanes of each operand, 0xdd the odd lanes:
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );   /* x0 x1 x2 x3 */
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );   /* y0 y1 y2 y3 */
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );   /* z0 z1 z2 z3 */
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );   /* w0 w1 w2 w3 */

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_input );
}
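
/* Scalar equivalent of the transpose aos_to_soa() emits, assuming four
 * float[4] attributes per vertex group and the SoA layout of one 16-byte
 * vector per channel per attribute.  Kept for illustration only:
 */
#if 0
static void
aos_to_soa_ref( const char *aos, float *soa,
                uint num_inputs, uint stride )
{
   uint attrib, chan, vert;

   for (attrib = 0; attrib < num_inputs; attrib++) {
      for (chan = 0; chan < 4; chan++) {
         for (vert = 0; vert < 4; vert++) {
            const float *v = (const float *) (aos + vert * stride);
            soa[attrib * 16 + chan * 4 + vert] = v[attrib * 4 + chan];
         }
      }
   }
}
#endif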
2408
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
{
   struct x86_reg soa_output;
   struct x86_reg aos_output;
   struct x86_reg num_outputs;
   struct x86_reg temp;
   int inner_loop;

   soa_output = x86_make_reg( file_REG32, reg_AX );
   aos_output = x86_make_reg( file_REG32, reg_BX );
   num_outputs = x86_make_reg( file_REG32, reg_CX );
   temp = x86_make_reg( file_REG32, reg_DX );

   /* Save EBX */
   x86_push( func, aos_output );

   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load one vector per channel:
       * xmm0 = x0 x1 x2 x3,  xmm1 = y0 y1 y2 y3
       * xmm3 = z0 z1 z2 z3,  xmm4 = w0 w1 w2 w3
       */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Transpose back to per-vertex xy/zw halves: */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );   /* x0 y0 x1 y1 */
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );   /* x2 y2 x3 y3 */
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );   /* z0 w0 z1 w1 */
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );   /* z2 w2 z3 w3 */

      /* Scatter the four vertices, one xyzw at a time: */
      x86_mov( func, temp, x86_fn_arg( func, stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_output );
}
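
/* And the inverse reference for soa_to_aos(), again illustration only: */
#if 0
static void
soa_to_aos_ref( char *aos, const float *soa,
                uint num_outputs, uint stride )
{
   uint attrib, chan, vert;

   for (attrib = 0; attrib < num_outputs; attrib++) {
      for (chan = 0; chan < 4; chan++) {
         for (vert = 0; vert < 4; vert++) {
            float *v = (float *) (aos + vert * stride);
            v[attrib * 4 + chan] = soa[attrib * 16 + chan * 4 + vert];
         }
      }
   }
}
#endif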
2470
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs.  Because on x86 we have only 4 GP registers but 5 shader
 * arguments here (input, output, const, temp and coef), the code is split
 * into two phases -- a DECLARATION and an INSTRUCTION phase.  The GP
 * register holding the output argument is aliased with the coef argument,
 * as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert inputs from AoS to SoA
 *                     before execution and outputs back to AoS afterwards
 * \return  1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );

   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      if (do_swizzles)
         aos_to_soa( func,
                     6,     /* aos_input */
                     1,     /* machine->input */
                     7,     /* num_inputs */
                     8 );   /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coef with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
                     9,     /* aos_output */
                     2,     /* machine->output */
                     10,    /* num_outputs */
                     11 );  /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
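
/* A minimal usage sketch, kept under #if 0.  The typedef below is an
 * assumption spelling out the fragment-shader argument order loaded in
 * tgsi_emit_sse2() above (input, output, const, temp, coef, immediates);
 * the real typedefs live in the drivers that use this module.
 */
#if 0
typedef void (XSTDCALL *codegen_function)(
   const struct tgsi_exec_vector *input,
   struct tgsi_exec_vector *output,
   float (*constant)[4],
   struct tgsi_exec_vector *temporary,
   const struct tgsi_interp_coef *coef,
   float (*immediates)[4] );

static void
example_translate( const struct tgsi_token *tokens )
{
   struct x86_function func;
   float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];

   x86_init_func( &func );
   if (tgsi_emit_sse2( tokens, &func, immediates, FALSE )) {
      codegen_function fn = (codegen_function) x86_get_func( &func );
      /* ... invoke fn() per quad with the machine's vectors ... */
      (void) fn;
   }
   x86_release_func( &func );
}
#endif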

#endif /* PIPE_ARCH_X86 */