src/gallium/auxiliary/tgsi/tgsi_sse2.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 #include "pipe/p_config.h"
  29
  30 #if defined(PIPE_ARCH_X86)
  31
  32 #include "util/u_debug.h"
  33 #include "pipe/p_shader_tokens.h"
  34 #include "util/u_math.h"
  35 #include "util/u_memory.h"
  36 #if defined(PIPE_ARCH_SSE)
  37 #include "util/u_sse.h"
  38 #endif
  39 #include "tgsi/tgsi_info.h"
  40 #include "tgsi/tgsi_parse.h"
  41 #include "tgsi/tgsi_util.h"
  42 #include "tgsi/tgsi_dump.h"
  43 #include "tgsi/tgsi_exec.h"
  44 #include "tgsi/tgsi_sse2.h"
  45
  46 #include "rtasm/rtasm_x86sse.h"
  47
  48 /* for 1/sqrt()
  49  *
  50  * This costs about 100fps (close to 10%) in gears:
  51  */
  52 #define HIGH_PRECISION 1
  53
  54 #define FAST_MATH 1
  55
  56
  57 #define FOR_EACH_CHANNEL( CHAN )\
  58    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
  59
  60 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
  61    ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
  62
  63 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
  64    if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
  65
  66 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
  67    FOR_EACH_CHANNEL( CHAN )\
  68       IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
  69
  70 #define CHAN_X 0
  71 #define CHAN_Y 1
  72 #define CHAN_Z 2
  73 #define CHAN_W 3
  74
  75 #define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
  76 #define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
  77
  78 #define TEMP_R0   TGSI_EXEC_TEMP_R0
  79 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
  80 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
  81 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
  82
  83
  84 /**
  85  * X86 utility functions.
  86  */
  87
  88 static struct x86_reg
  89 make_xmm(
  90    unsigned xmm )
  91 {
  92    return x86_make_reg(
  93       file_XMM,
  94       (enum x86_reg_name) xmm );
  95 }
  96
  97 /**
  98  * X86 register mapping helpers.
  99  */
 100
 101 static struct x86_reg
 102 get_const_base( void )
 103 {
 104    return x86_make_reg(
 105       file_REG32,
 106       reg_AX );
 107 }
 108
 109 static struct x86_reg
 110 get_machine_base( void )
 111 {
 112    return x86_make_reg(
 113       file_REG32,
 114       reg_CX );
 115 }
 116
 117 static struct x86_reg
 118 get_input_base( void )
 119 {
 120    return x86_make_disp(
 121       get_machine_base(),
 122       Offset(struct tgsi_exec_machine, Inputs) );
 123 }
 124
 125 static struct x86_reg
 126 get_output_base( void )
 127 {
 128    return x86_make_disp(
 129       get_machine_base(),
 130       Offset(struct tgsi_exec_machine, Outputs) );
 131 }
 132
 133 static struct x86_reg
 134 get_temp_base( void )
 135 {
 136    return x86_make_disp(
 137       get_machine_base(),
 138       Offset(struct tgsi_exec_machine, Temps) );
 139 }
 140
 141 static struct x86_reg
 142 get_coef_base( void )
 143 {
 144    return x86_make_reg(
 145       file_REG32,
 146       reg_BX );
 147 }
 148
 149 static struct x86_reg
 150 get_sampler_base( void )
 151 {
 152    return x86_make_reg(
 153       file_REG32,
 154       reg_DI );
 155 }
 156
 157 static struct x86_reg
 158 get_immediate_base( void )
 159 {
 160    return x86_make_reg(
 161       file_REG32,
 162       reg_DX );
 163 }
 164
 165
 166 /**
 167  * Data access helpers.
 168  */
 169
 170
 171 static struct x86_reg
 172 get_immediate(
 173    unsigned vec,
 174    unsigned chan )
 175 {
 176    return x86_make_disp(
 177       get_immediate_base(),
 178       (vec * 4 + chan) * 4 );
 179 }
 180
 181 static struct x86_reg
 182 get_const(
 183    unsigned vec,
 184    unsigned chan )
 185 {
 186    return x86_make_disp(
 187       get_const_base(),
 188       (vec * 4 + chan) * 4 );
 189 }
 190
 191 static struct x86_reg
 192 get_sampler_ptr(
 193    unsigned unit )
 194 {
 195    return x86_make_disp(
 196       get_sampler_base(),
 197       unit * sizeof( struct tgsi_sampler * ) );
 198 }
 199
 200 static struct x86_reg
 201 get_input(
 202    unsigned vec,
 203    unsigned chan )
 204 {
 205    return x86_make_disp(
 206       get_input_base(),
 207       (vec * 4 + chan) * 16 );
 208 }
 209
 210 static struct x86_reg
 211 get_output(
 212    unsigned vec,
 213    unsigned chan )
 214 {
 215    return x86_make_disp(
 216       get_output_base(),
 217       (vec * 4 + chan) * 16 );
 218 }
 219
 220 static struct x86_reg
 221 get_temp(
 222    unsigned vec,
 223    unsigned chan )
 224 {
 225    return x86_make_disp(
 226       get_temp_base(),
 227       (vec * 4 + chan) * 16 );
 228 }
 229
 230 static struct x86_reg
 231 get_coef(
 232    unsigned vec,
 233    unsigned chan,
 234    unsigned member )
 235 {
 236    return x86_make_disp(
 237       get_coef_base(),
 238       ((vec * 3 + member) * 4 + chan) * 4 );
 239 }
 240
 241
 242 static void
 243 emit_ret(
 244    struct x86_function  *func )
 245 {
 246    x86_ret( func );
 247 }
 248
 249
 250 /**
 251  * Data fetch helpers.
 252  */
 253
 254 /**
 255  * Copy a shader constant to xmm register
 256  * \param xmm  the destination xmm register
 257  * \param vec  the src const buffer index
 258  * \param chan  src channel to fetch (X, Y, Z or W)
 259  */
 260 static void
 261 emit_const(
 262    struct x86_function *func,
 263    uint xmm,
 264    int vec,
 265    uint chan,
 266    uint indirect,
 267    uint indirectFile,
 268    int indirectIndex )
 269 {
 270    if (indirect) {
 271       /* 'vec' is the offset from the address register's value.
 272        * We're loading CONST[ADDR+vec] into an xmm register.
 273        */
 274       struct x86_reg r0 = get_immediate_base();
 275       struct x86_reg r1 = get_coef_base();
 276       uint i;
 277
 278       assert( indirectFile == TGSI_FILE_ADDRESS );
 279       assert( indirectIndex == 0 );
 280       assert( r0.mod == mod_REG );
 281       assert( r1.mod == mod_REG );
 282
 283       x86_push( func, r0 );
 284       x86_push( func, r1 );
 285
 286       /*
 287        * Loop over the four pixels or vertices in the quad.
 288        * Get the value of the address (offset) register for pixel/vertex[i],
 289        * add it to the src offset and index into the constant buffer.
 290        * Note that we're working on SOA data.
 291        * If any of the pixel/vertex execution channels are unused their
 292        * values will be garbage.  It's very important that we don't use
 293        * those garbage values as indexes into the constant buffer since
 294        * that'll cause segfaults.
 295        * The solution is to bitwise-AND the offset with the execution mask
 296        * register whose values are either 0 or ~0.
 297        * The caller must setup the execution mask register to indicate
 298        * which channels are valid/alive before running the shader.
 299        * The execution mask will also figure into loops and conditionals
 300        * someday.
 301        */
 302       for (i = 0; i < QUAD_SIZE; i++) {
 303          /* r1 = address register[i] */
 304          x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
 305          /* r0 = execution mask[i] */
 306          x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
 307          /* r1 = r1 & r0 */
 308          x86_and( func, r1, r0 );
 309          /* r0 = 'vec', the offset */
 310          x86_lea( func, r0, get_const( vec, chan ) );
 311
 312          /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
 313           */
 314          x86_add( func, r1, r1 );
 315          x86_add( func, r1, r1 );
 316          x86_add( func, r1, r1 );
 317          x86_add( func, r1, r1 );
 318
 319          x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
 320          x86_mov( func, r1, x86_deref( r0 ) );
 321          x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
 322       }
 323
 324       x86_pop( func, r1 );
 325       x86_pop( func, r0 );
 326
 327       sse_movaps(
 328          func,
 329          make_xmm( xmm ),
 330          get_temp( TEMP_R0, CHAN_X ) );
 331    }
 332    else {
 333       /* 'vec' is the index into the src register file, such as TEMP[vec] */
 334       assert( vec >= 0 );
 335
 336       sse_movss(
 337          func,
 338          make_xmm( xmm ),
 339          get_const( vec, chan ) );
 340       sse_shufps(
 341          func,
 342          make_xmm( xmm ),
 343          make_xmm( xmm ),
 344          SHUF( 0, 0, 0, 0 ) );
 345    }
 346 }
 347
 348 static void
 349 emit_immediate(
 350    struct x86_function *func,
 351    unsigned xmm,
 352    unsigned vec,
 353    unsigned chan )
 354 {
 355    sse_movss(
 356       func,
 357       make_xmm( xmm ),
 358       get_immediate( vec, chan ) );
 359    sse_shufps(
 360       func,
 361       make_xmm( xmm ),
 362       make_xmm( xmm ),
 363       SHUF( 0, 0, 0, 0 ) );
 364 }
 365
 366
 367 /**
 368  * Copy a shader input to xmm register
 369  * \param xmm  the destination xmm register
 370  * \param vec  the src input attrib
 371  * \param chan  src channel to fetch (X, Y, Z or W)
 372  */
 373 static void
 374 emit_inputf(
 375    struct x86_function *func,
 376    unsigned xmm,
 377    unsigned vec,
 378    unsigned chan )
 379 {
 380    sse_movups(
 381       func,
 382       make_xmm( xmm ),
 383       get_input( vec, chan ) );
 384 }
 385
 386 /**
 387  * Store an xmm register to a shader output
 388  * \param xmm  the source xmm register
 389  * \param vec  the dest output attrib
 390  * \param chan  src dest channel to store (X, Y, Z or W)
 391  */
 392 static void
 393 emit_output(
 394    struct x86_function *func,
 395    unsigned xmm,
 396    unsigned vec,
 397    unsigned chan )
 398 {
 399    sse_movups(
 400       func,
 401       get_output( vec, chan ),
 402       make_xmm( xmm ) );
 403 }
 404
 405 /**
 406  * Copy a shader temporary to xmm register
 407  * \param xmm  the destination xmm register
 408  * \param vec  the src temp register
 409  * \param chan  src channel to fetch (X, Y, Z or W)
 410  */
 411 static void
 412 emit_tempf(
 413    struct x86_function *func,
 414    unsigned xmm,
 415    unsigned vec,
 416    unsigned chan )
 417 {
 418    sse_movaps(
 419       func,
 420       make_xmm( xmm ),
 421       get_temp( vec, chan ) );
 422 }
 423
 424 /**
 425  * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 426  * \param xmm  the destination xmm register
 427  * \param vec  the src input/attribute coefficient index
 428  * \param chan  src channel to fetch (X, Y, Z or W)
 429  * \param member  0=a0, 1=dadx, 2=dady
 430  */
 431 static void
 432 emit_coef(
 433    struct x86_function *func,
 434    unsigned xmm,
 435    unsigned vec,
 436    unsigned chan,
 437    unsigned member )
 438 {
 439    sse_movss(
 440       func,
 441       make_xmm( xmm ),
 442       get_coef( vec, chan, member ) );
 443    sse_shufps(
 444       func,
 445       make_xmm( xmm ),
 446       make_xmm( xmm ),
 447       SHUF( 0, 0, 0, 0 ) );
 448 }
 449
 450 /**
 451  * Data store helpers.
 452  */
 453
 454 static void
 455 emit_inputs(
 456    struct x86_function *func,
 457    unsigned xmm,
 458    unsigned vec,
 459    unsigned chan )
 460 {
 461    sse_movups(
 462       func,
 463       get_input( vec, chan ),
 464       make_xmm( xmm ) );
 465 }
 466
 467 static void
 468 emit_temps(
 469    struct x86_function *func,
 470    unsigned xmm,
 471    unsigned vec,
 472    unsigned chan )
 473 {
 474    sse_movaps(
 475       func,
 476       get_temp( vec, chan ),
 477       make_xmm( xmm ) );
 478 }
 479
 480 static void
 481 emit_addrs(
 482    struct x86_function *func,
 483    unsigned xmm,
 484    unsigned vec,
 485    unsigned chan )
 486 {
 487    assert( vec == 0 );
 488
 489    emit_temps(
 490       func,
 491       xmm,
 492       vec + TGSI_EXEC_TEMP_ADDR,
 493       chan );
 494 }
 495
 496 /**
  497  * Coefficient fetch helpers.
 498  */
 499
 500 static void
 501 emit_coef_a0(
 502    struct x86_function *func,
 503    unsigned xmm,
 504    unsigned vec,
 505    unsigned chan )
 506 {
 507    emit_coef(
 508       func,
 509       xmm,
 510       vec,
 511       chan,
 512       0 );
 513 }
 514
 515 static void
 516 emit_coef_dadx(
 517    struct x86_function *func,
 518    unsigned xmm,
 519    unsigned vec,
 520    unsigned chan )
 521 {
 522    emit_coef(
 523       func,
 524       xmm,
 525       vec,
 526       chan,
 527       1 );
 528 }
 529
 530 static void
 531 emit_coef_dady(
 532    struct x86_function *func,
 533    unsigned xmm,
 534    unsigned vec,
 535    unsigned chan )
 536 {
 537    emit_coef(
 538       func,
 539       xmm,
 540       vec,
 541       chan,
 542       2 );
 543 }
 544
 545 /**
 546  * Function call helpers.
 547  */
 548
 549 /**
 550  * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 551  * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 552  * that the stack pointer is 16 byte aligned, as expected.
 553  */
 554 static void
 555 emit_func_call(
 556    struct x86_function *func,
 557    unsigned xmm_save_mask,
 558    const struct x86_reg *arg,
 559    unsigned nr_args,
 560    void (PIPE_CDECL *code)() )
 561 {
 562    struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
 563    unsigned i, n;
 564
 565    x86_push(
 566       func,
 567       x86_make_reg( file_REG32, reg_AX) );
 568    x86_push(
 569       func,
 570       x86_make_reg( file_REG32, reg_CX) );
 571    x86_push(
 572       func,
 573       x86_make_reg( file_REG32, reg_DX) );
 574
 575    /* Store XMM regs to the stack
 576     */
 577    for(i = 0, n = 0; i < 8; ++i)
 578       if(xmm_save_mask & (1 << i))
 579          ++n;
 580
 581    x86_sub_imm(
 582       func,
 583       x86_make_reg( file_REG32, reg_SP ),
 584       n*16);
 585
 586    for(i = 0, n = 0; i < 8; ++i)
 587       if(xmm_save_mask & (1 << i)) {
 588          sse_movups(
 589             func,
 590             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
 591             make_xmm( i ) );
 592          ++n;
 593       }
 594
 595    for (i = 0; i < nr_args; i++) {
 596       /* Load the address of the buffer we use for passing arguments and
 597        * receiving results:
 598        */
 599       x86_lea(
 600          func,
 601          ecx,
 602          arg[i] );
 603
 604       /* Push actual function arguments (currently just the pointer to
 605        * the buffer above), and call the function:
 606        */
 607       x86_push( func, ecx );
 608    }
 609
 610    x86_mov_reg_imm( func, ecx, (unsigned long) code );
 611    x86_call( func, ecx );
 612
 613    /* Pop the arguments (or just add an immediate to esp)
 614     */
 615    for (i = 0; i < nr_args; i++) {
 616       x86_pop(func, ecx );
 617    }
 618
 619    /* Pop the saved XMM regs:
 620     */
 621    for(i = 0, n = 0; i < 8; ++i)
 622       if(xmm_save_mask & (1 << i)) {
 623          sse_movups(
 624             func,
 625             make_xmm( i ),
 626             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
 627          ++n;
 628       }
 629
 630    x86_add_imm(
 631       func,
 632       x86_make_reg( file_REG32, reg_SP ),
 633       n*16);
 634
 635    /* Restore GP registers in a reverse order.
 636     */
 637    x86_pop(
 638       func,
 639       x86_make_reg( file_REG32, reg_DX) );
 640    x86_pop(
 641       func,
 642       x86_make_reg( file_REG32, reg_CX) );
 643    x86_pop(
 644       func,
 645       x86_make_reg( file_REG32, reg_AX) );
 646 }
 647
 648 static void
 649 emit_func_call_dst_src1(
 650    struct x86_function *func,
 651    unsigned xmm_save,
 652    unsigned xmm_dst,
 653    unsigned xmm_src0,
 654    void (PIPE_CDECL *code)() )
 655 {
 656    struct x86_reg store = get_temp( TEMP_R0, 0 );
 657    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 658
 659    /* Store our input parameters (in xmm regs) to the buffer we use
 660     * for passing arguments.  We will pass a pointer to this buffer as
 661     * the actual function argument.
 662     */
 663    sse_movaps(
 664       func,
 665       store,
 666       make_xmm( xmm_src0 ) );
 667
 668    emit_func_call( func,
 669                    xmm_mask,
 670                    &store,
 671                    1,
 672                    code );
 673
 674    sse_movaps(
 675       func,
 676       make_xmm( xmm_dst ),
 677       store );
 678 }
 679
 680
 681 static void
 682 emit_func_call_dst_src2(
 683    struct x86_function *func,
 684    unsigned xmm_save,
 685    unsigned xmm_dst,
 686    unsigned xmm_src0,
 687    unsigned xmm_src1,
 688    void (PIPE_CDECL *code)() )
 689 {
 690    struct x86_reg store = get_temp( TEMP_R0, 0 );
 691    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 692
 693    /* Store two inputs to parameter buffer.
 694     */
 695    sse_movaps(
 696       func,
 697       store,
 698       make_xmm( xmm_src0 ) );
 699
 700    sse_movaps(
 701       func,
 702       x86_make_disp( store, 4 * sizeof(float) ),
 703       make_xmm( xmm_src1 ) );
 704
 705
 706    /* Emit the call
 707     */
 708    emit_func_call( func,
 709                    xmm_mask,
 710                    &store,
 711                    1,
 712                    code );
 713
 714    /* Retrieve the results:
 715     */
 716    sse_movaps(
 717       func,
 718       make_xmm( xmm_dst ),
 719       store );
 720 }
 721
 722
 723
 724
 725
 726 #if defined(PIPE_ARCH_SSE)
 727
 728 /*
 729  * Fast SSE2 implementation of special math functions.
 730  */
 731
 732 #define POLY0(x, c0) _mm_set1_ps(c0)
 733 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
 734 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
 735 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
 736 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
 737 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
 738
 739 #define EXP_POLY_DEGREE 3
 740 #define LOG_POLY_DEGREE 5
 741
 742 /**
 743  * See http://www.devmaster.net/forums/showthread.php?p=43580
 744  */
 745 static INLINE __m128
 746 exp2f4(__m128 x)
 747 {
 748    __m128i ipart;
 749    __m128 fpart, expipart, expfpart;
 750
 751    x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
 752    x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
 753
 754    /* ipart = int(x - 0.5) */
 755    ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
 756
 757    /* fpart = x - ipart */
 758    fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
 759
 760    /* expipart = (float) (1 << ipart) */
 761    expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
 762
 763    /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
 764 #if EXP_POLY_DEGREE == 5
 765    expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
 766 #elif EXP_POLY_DEGREE == 4
 767    expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
 768 #elif EXP_POLY_DEGREE == 3
 769    expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
 770 #elif EXP_POLY_DEGREE == 2
 771    expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
 772 #else
 773 #error
 774 #endif
 775
 776    return _mm_mul_ps(expipart, expfpart);
 777 }
 778
 779
 780 /**
 781  * See http://www.devmaster.net/forums/showthread.php?p=43580
 782  */
 783 static INLINE __m128
 784 log2f4(__m128 x)
 785 {
 786    __m128i expmask = _mm_set1_epi32(0x7f800000);
 787    __m128i mantmask = _mm_set1_epi32(0x007fffff);
 788    __m128 one = _mm_set1_ps(1.0f);
 789
 790    __m128i i = _mm_castps_si128(x);
 791
 792    /* exp = (float) exponent(x) */
 793    __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
 794
 795    /* mant = (float) mantissa(x) */
 796    __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
 797
 798    __m128 logmant;
 799
 800    /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 801     * These coefficients can be generate with
 802     * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 803     */
 804 #if LOG_POLY_DEGREE == 6
 805    logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
 806 #elif LOG_POLY_DEGREE == 5
 807    logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
 808 #elif LOG_POLY_DEGREE == 4
 809    logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
 810 #elif LOG_POLY_DEGREE == 3
 811    logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
 812 #else
 813 #error
 814 #endif
 815
 816    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
 817    logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
 818
 819    return _mm_add_ps(logmant, exp);
 820 }
 821
 822
 823 static INLINE __m128
 824 powf4(__m128 x, __m128 y)
 825 {
 826    return exp2f4(_mm_mul_ps(log2f4(x), y));
 827 }
 828
 829 #endif /* PIPE_ARCH_SSE */
 830
 831
 832
 833 /**
 834  * Low-level instruction translators.
 835  */
 836
 837 static void
 838 emit_abs(
 839    struct x86_function *func,
 840    unsigned xmm )
 841 {
 842    sse_andps(
 843       func,
 844       make_xmm( xmm ),
 845       get_temp(
 846          TGSI_EXEC_TEMP_7FFFFFFF_I,
 847          TGSI_EXEC_TEMP_7FFFFFFF_C ) );
 848 }
 849
 850 static void
 851 emit_add(
 852    struct x86_function *func,
 853    unsigned xmm_dst,
 854    unsigned xmm_src )
 855 {
 856    sse_addps(
 857       func,
 858       make_xmm( xmm_dst ),
 859       make_xmm( xmm_src ) );
 860 }
 861
 862 static void PIPE_CDECL
 863 cos4f(
 864    float *store )
 865 {
 866    store[0] = cosf( store[0] );
 867    store[1] = cosf( store[1] );
 868    store[2] = cosf( store[2] );
 869    store[3] = cosf( store[3] );
 870 }
 871
 872 static void
 873 emit_cos(
 874    struct x86_function *func,
 875    unsigned xmm_save,
 876    unsigned xmm_dst )
 877 {
 878    emit_func_call_dst_src1(
 879       func,
 880       xmm_save,
 881       xmm_dst,
 882       xmm_dst,
 883       cos4f );
 884 }
 885
 886 static void PIPE_CDECL
 887 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 888 __attribute__((force_align_arg_pointer))
 889 #endif
 890 ex24f(
 891    float *store )
 892 {
 893 #if defined(PIPE_ARCH_SSE)
 894    _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
 895 #else
 896    store[0] = util_fast_exp2( store[0] );
 897    store[1] = util_fast_exp2( store[1] );
 898    store[2] = util_fast_exp2( store[2] );
 899    store[3] = util_fast_exp2( store[3] );
 900 #endif
 901 }
 902
 903 static void
 904 emit_ex2(
 905    struct x86_function *func,
 906    unsigned xmm_save,
 907    unsigned xmm_dst )
 908 {
 909    emit_func_call_dst_src1(
 910       func,
 911       xmm_save,
 912       xmm_dst,
 913       xmm_dst,
 914       ex24f );
 915 }
 916
 917 static void
 918 emit_f2it(
 919    struct x86_function *func,
 920    unsigned xmm )
 921 {
 922    sse2_cvttps2dq(
 923       func,
 924       make_xmm( xmm ),
 925       make_xmm( xmm ) );
 926 }
 927
 928 static void
 929 emit_i2f(
 930    struct x86_function *func,
 931    unsigned xmm )
 932 {
 933    sse2_cvtdq2ps(
 934       func,
 935       make_xmm( xmm ),
 936       make_xmm( xmm ) );
 937 }
 938
 939 static void PIPE_CDECL
 940 flr4f(
 941    float *store )
 942 {
 943    store[0] = floorf( store[0] );
 944    store[1] = floorf( store[1] );
 945    store[2] = floorf( store[2] );
 946    store[3] = floorf( store[3] );
 947 }
 948
 949 static void
 950 emit_flr(
 951    struct x86_function *func,
 952    unsigned xmm_save,
 953    unsigned xmm_dst )
 954 {
 955    emit_func_call_dst_src1(
 956       func,
 957       xmm_save,
 958       xmm_dst,
 959       xmm_dst,
 960       flr4f );
 961 }
 962
 963 static void PIPE_CDECL
 964 frc4f(
 965    float *store )
 966 {
 967    store[0] -= floorf( store[0] );
 968    store[1] -= floorf( store[1] );
 969    store[2] -= floorf( store[2] );
 970    store[3] -= floorf( store[3] );
 971 }
 972
 973 static void
 974 emit_frc(
 975    struct x86_function *func,
 976    unsigned xmm_save,
 977    unsigned xmm_dst )
 978 {
 979    emit_func_call_dst_src1(
 980       func,
 981       xmm_save,
 982       xmm_dst,
 983       xmm_dst,
 984       frc4f );
 985 }
 986
 987 static void PIPE_CDECL
 988 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 989 __attribute__((force_align_arg_pointer))
 990 #endif
 991 lg24f(
 992    float *store )
 993 {
 994 #if defined(PIPE_ARCH_SSE)
 995    _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
 996 #else
 997    store[0] = util_fast_log2( store[0] );
 998    store[1] = util_fast_log2( store[1] );
 999    store[2] = util_fast_log2( store[2] );
1000    store[3] = util_fast_log2( store[3] );
1001 #endif
1002 }
1003
1004 static void
1005 emit_lg2(
1006    struct x86_function *func,
1007    unsigned xmm_save,
1008    unsigned xmm_dst )
1009 {
1010    emit_func_call_dst_src1(
1011       func,
1012       xmm_save,
1013       xmm_dst,
1014       xmm_dst,
1015       lg24f );
1016 }
1017
1018 static void
1019 emit_MOV(
1020    struct x86_function *func,
1021    unsigned xmm_dst,
1022    unsigned xmm_src )
1023 {
1024    sse_movups(
1025       func,
1026       make_xmm( xmm_dst ),
1027       make_xmm( xmm_src ) );
1028 }
1029
1030 static void
1031 emit_mul (struct x86_function *func,
1032           unsigned xmm_dst,
1033           unsigned xmm_src)
1034 {
1035    sse_mulps(
1036       func,
1037       make_xmm( xmm_dst ),
1038       make_xmm( xmm_src ) );
1039 }
1040
1041 static void
1042 emit_neg(
1043    struct x86_function *func,
1044    unsigned xmm )
1045 {
1046    sse_xorps(
1047       func,
1048       make_xmm( xmm ),
1049       get_temp(
1050          TGSI_EXEC_TEMP_80000000_I,
1051          TGSI_EXEC_TEMP_80000000_C ) );
1052 }
1053
1054 static void PIPE_CDECL
1055 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1056 __attribute__((force_align_arg_pointer))
1057 #endif
1058 pow4f(
1059    float *store )
1060 {
1061 #if defined(PIPE_ARCH_SSE)
1062    _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1063 #else
1064    store[0] = util_fast_pow( store[0], store[4] );
1065    store[1] = util_fast_pow( store[1], store[5] );
1066    store[2] = util_fast_pow( store[2], store[6] );
1067    store[3] = util_fast_pow( store[3], store[7] );
1068 #endif
1069 }
1070
1071 static void
1072 emit_pow(
1073    struct x86_function *func,
1074    unsigned xmm_save,
1075    unsigned xmm_dst,
1076    unsigned xmm_src0,
1077    unsigned xmm_src1 )
1078 {
1079    emit_func_call_dst_src2(
1080       func,
1081       xmm_save,
1082       xmm_dst,
1083       xmm_src0,
1084       xmm_src1,
1085       pow4f );
1086 }
1087
1088 static void
1089 emit_rcp (
1090    struct x86_function *func,
1091    unsigned xmm_dst,
1092    unsigned xmm_src )
1093 {
1094    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1095     * good enough.  Need to either emit a proper divide or use the
1096     * iterative technique described below in emit_rsqrt().
1097     */
1098    sse2_rcpps(
1099       func,
1100       make_xmm( xmm_dst ),
1101       make_xmm( xmm_src ) );
1102 }
1103
1104 static void PIPE_CDECL
1105 rnd4f(
1106    float *store )
1107 {
1108    store[0] = floorf( store[0] + 0.5f );
1109    store[1] = floorf( store[1] + 0.5f );
1110    store[2] = floorf( store[2] + 0.5f );
1111    store[3] = floorf( store[3] + 0.5f );
1112 }
1113
1114 static void
1115 emit_rnd(
1116    struct x86_function *func,
1117    unsigned xmm_save,
1118    unsigned xmm_dst )
1119 {
1120    emit_func_call_dst_src1(
1121       func,
1122       xmm_save,
1123       xmm_dst,
1124       xmm_dst,
1125       rnd4f );
1126 }
1127
/**
 * Emit xmm_dst = 1 / sqrt(xmm_src).
 *
 * With HIGH_PRECISION the rsqrtps estimate is refined as described below.
 * That path overwrites xmm_src and uses xmm2 and xmm3 as scratch, so
 * neither operand may be xmm2 or xmm3 and the two operands must differ.
 */
static void
emit_rsqrt(struct x86_function *func, unsigned xmm_dst, unsigned xmm_src)
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   const struct x86_reg result = make_xmm( xmm_dst );
   const struct x86_reg operand = make_xmm( xmm_src );
   const struct x86_reg three = make_xmm( 2 );
   const struct x86_reg estimate = make_xmm( 3 );

   assert( xmm_dst != xmm_src );
   assert( xmm_dst != 2 && xmm_dst != 3 );
   assert( xmm_src != 2 && xmm_src != 3 );

   /* result = 0.5, three = 3.0 (from the constant temporaries) */
   sse_movaps(  func, result, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
   sse_movaps(  func, three, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
   /* estimate = rsqrtps(a) */
   sse_rsqrtps( func, estimate, operand );
   /* operand = a * estimate */
   sse_mulps(   func, operand, estimate );
   /* result = 0.5 * estimate */
   sse_mulps(   func, result, estimate );
   /* operand = (a * estimate) * estimate */
   sse_mulps(   func, operand, estimate );
   /* three = 3.0 - operand */
   sse_subps(   func, three, operand );
   /* result = 0.5 * estimate * (3.0 - a * estimate * estimate) */
   sse_mulps(   func, result, three );
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
#endif
}
1173
1174 static void
1175 emit_setsign(
1176    struct x86_function *func,
1177    unsigned xmm )
1178 {
1179    sse_orps(
1180       func,
1181       make_xmm( xmm ),
1182       get_temp(
1183          TGSI_EXEC_TEMP_80000000_I,
1184          TGSI_EXEC_TEMP_80000000_C ) );
1185 }
1186
/**
 * Replace each of the four floats in store[] with its sign:
 * -1.0f for negative values, 1.0f for positive values, 0.0f otherwise.
 */
static void PIPE_CDECL
sgn4f(
   float *store )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      const float value = store[lane];

      if (value < 0.0f)
         store[lane] = -1.0f;
      else if (value > 0.0f)
         store[lane] = 1.0f;
      else
         store[lane] = 0.0f;
   }
}
1196
1197 static void
1198 emit_sgn(
1199    struct x86_function *func,
1200    unsigned xmm_save,
1201    unsigned xmm_dst )
1202 {
1203    emit_func_call_dst_src1(
1204       func,
1205       xmm_save,
1206       xmm_dst,
1207       xmm_dst,
1208       sgn4f );
1209 }
1210
1211 static void PIPE_CDECL
1212 sin4f(
1213    float *store )
1214 {
1215    store[0] = sinf( store[0] );
1216    store[1] = sinf( store[1] );
1217    store[2] = sinf( store[2] );
1218    store[3] = sinf( store[3] );
1219 }
1220
1221 static void
1222 emit_sin (struct x86_function *func,
1223           unsigned xmm_save,
1224           unsigned xmm_dst)
1225 {
1226    emit_func_call_dst_src1(
1227       func,
1228       xmm_save,
1229       xmm_dst,
1230       xmm_dst,
1231       sin4f );
1232 }
1233
1234 static void
1235 emit_sub(
1236    struct x86_function *func,
1237    unsigned xmm_dst,
1238    unsigned xmm_src )
1239 {
1240    sse_subps(
1241       func,
1242       make_xmm( xmm_dst ),
1243       make_xmm( xmm_src ) );
1244 }
1245
1246
1247
1248
1249
1250
1251
1252 /**
1253  * Register fetch.
1254  */
1255
1256 static void
1257 emit_fetch(
1258    struct x86_function *func,
1259    unsigned xmm,
1260    const struct tgsi_full_src_register *reg,
1261    const unsigned chan_index )
1262 {
1263    unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1264
1265    switch (swizzle) {
1266    case TGSI_SWIZZLE_X:
1267    case TGSI_SWIZZLE_Y:
1268    case TGSI_SWIZZLE_Z:
1269    case TGSI_SWIZZLE_W:
1270       switch (reg->Register.File) {
1271       case TGSI_FILE_CONSTANT:
1272          emit_const(
1273             func,
1274             xmm,
1275             reg->Register.Index,
1276             swizzle,
1277             reg->Register.Indirect,
1278             reg->Indirect.File,
1279             reg->Indirect.Index );
1280          break;
1281
1282       case TGSI_FILE_IMMEDIATE:
1283          emit_immediate(
1284             func,
1285             xmm,
1286             reg->Register.Index,
1287             swizzle );
1288          break;
1289
1290       case TGSI_FILE_INPUT:
1291          emit_inputf(
1292             func,
1293             xmm,
1294             reg->Register.Index,
1295             swizzle );
1296          break;
1297
1298       case TGSI_FILE_TEMPORARY:
1299          emit_tempf(
1300             func,
1301             xmm,
1302             reg->Register.Index,
1303             swizzle );
1304          break;
1305
1306       default:
1307          assert( 0 );
1308       }
1309       break;
1310
1311    default:
1312       assert( 0 );
1313    }
1314
1315    switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1316    case TGSI_UTIL_SIGN_CLEAR:
1317       emit_abs( func, xmm );
1318       break;
1319
1320    case TGSI_UTIL_SIGN_SET:
1321       emit_setsign( func, xmm );
1322       break;
1323
1324    case TGSI_UTIL_SIGN_TOGGLE:
1325       emit_neg( func, xmm );
1326       break;
1327
1328    case TGSI_UTIL_SIGN_KEEP:
1329       break;
1330    }
1331 }
1332
/**
 * Shorthand for emit_fetch() on source operand number INDEX of
 * instruction INST: channel CHAN is fetched into XMM register number XMM.
 */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_fetch( (FUNC), (XMM), &(INST).Src[(INDEX)], (CHAN) )
1335
1336 /**
1337  * Register store.
1338  */
1339
1340 static void
1341 emit_store(
1342    struct x86_function *func,
1343    unsigned xmm,
1344    const struct tgsi_full_dst_register *reg,
1345    const struct tgsi_full_instruction *inst,
1346    unsigned chan_index )
1347 {
1348    switch( inst->Instruction.Saturate ) {
1349    case TGSI_SAT_NONE:
1350       break;
1351
1352    case TGSI_SAT_ZERO_ONE:
1353       sse_maxps(
1354          func,
1355          make_xmm( xmm ),
1356          get_temp(
1357             TGSI_EXEC_TEMP_00000000_I,
1358             TGSI_EXEC_TEMP_00000000_C ) );
1359
1360       sse_minps(
1361          func,
1362          make_xmm( xmm ),
1363          get_temp(
1364             TGSI_EXEC_TEMP_ONE_I,
1365             TGSI_EXEC_TEMP_ONE_C ) );
1366       break;
1367
1368    case TGSI_SAT_MINUS_PLUS_ONE:
1369       assert( 0 );
1370       break;
1371    }
1372
1373
1374    switch( reg->Register.File ) {
1375    case TGSI_FILE_OUTPUT:
1376       emit_output(
1377          func,
1378          xmm,
1379          reg->Register.Index,
1380          chan_index );
1381       break;
1382
1383    case TGSI_FILE_TEMPORARY:
1384       emit_temps(
1385          func,
1386          xmm,
1387          reg->Register.Index,
1388          chan_index );
1389       break;
1390
1391    case TGSI_FILE_ADDRESS:
1392       emit_addrs(
1393          func,
1394          xmm,
1395          reg->Register.Index,
1396          chan_index );
1397       break;
1398
1399    default:
1400       assert( 0 );
1401    }
1402 }
1403
/**
 * Shorthand for emit_store() on destination operand number INDEX of
 * instruction INST: XMM register number XMM is stored to channel CHAN.
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_store( (FUNC), (XMM), &(INST).Dst[(INDEX)], &(INST), (CHAN) )
1406
1407
1408 static void PIPE_CDECL
1409 fetch_texel( struct tgsi_sampler **sampler,
1410              float *store )
1411 {
1412 #if 0
1413    uint j;
1414
1415    debug_printf("%s sampler: %p (%p) store: %p\n",
1416                 __FUNCTION__,
1417                 sampler, *sampler,
1418                 store );
1419
1420    debug_printf("lodbias %f\n", store[12]);
1421
1422    for (j = 0; j < 4; j++)
1423       debug_printf("sample %d texcoord %f %f\n",
1424                    j,
1425                    store[0+j],
1426                    store[4+j]);
1427 #endif
1428
1429    {
1430       float rgba[NUM_CHANNELS][QUAD_SIZE];
1431       (*sampler)->get_samples(*sampler,
1432                               &store[0],  /* s */
1433                               &store[4],  /* t */
1434                               &store[8],  /* r */
1435                               store[12],  /* lodbias */
1436                               rgba);      /* results */
1437
1438       memcpy( store, rgba, 16 * sizeof(float));
1439    }
1440
1441 #if 0
1442    for (j = 0; j < 4; j++)
1443       debug_printf("sample %d result %f %f %f %f\n",
1444                    j,
1445                    store[0+j],
1446                    store[4+j],
1447                    store[8+j],
1448                    store[12+j]);
1449 #endif
1450 }
1451
1452 /**
1453  * High-level instruction translators.
1454  */
1455
1456 static void
1457 emit_tex( struct x86_function *func,
1458           const struct tgsi_full_instruction *inst,
1459           boolean lodbias,
1460           boolean projected)
1461 {
1462    const uint unit = inst->Src[1].Register.Index;
1463    struct x86_reg args[2];
1464    unsigned count;
1465    unsigned i;
1466
1467    assert(inst->Instruction.Texture);
1468    switch (inst->Texture.Texture) {
1469    case TGSI_TEXTURE_1D:
1470       count = 1;
1471       break;
1472    case TGSI_TEXTURE_2D:
1473    case TGSI_TEXTURE_RECT:
1474       count = 2;
1475       break;
1476    case TGSI_TEXTURE_SHADOW1D:
1477    case TGSI_TEXTURE_SHADOW2D:
1478    case TGSI_TEXTURE_SHADOWRECT:
1479    case TGSI_TEXTURE_3D:
1480    case TGSI_TEXTURE_CUBE:
1481       count = 3;
1482       break;
1483    default:
1484       assert(0);
1485       return;
1486    }
1487
1488    if (lodbias) {
1489       FETCH( func, *inst, 3, 0, 3 );
1490    }
1491    else {
1492       emit_tempf(
1493          func,
1494          3,
1495          TGSI_EXEC_TEMP_00000000_I,
1496          TGSI_EXEC_TEMP_00000000_C );
1497
1498    }
1499
1500    /* store lodbias whether enabled or not -- fetch_texel currently
1501     * respects it always.
1502     */
1503    sse_movaps( func,
1504                get_temp( TEMP_R0, 3 ),
1505                make_xmm( 3 ) );
1506
1507
1508    if (projected) {
1509       FETCH( func, *inst, 3, 0, 3 );
1510
1511       emit_rcp( func, 3, 3 );
1512    }
1513
1514    for (i = 0; i < count; i++) {
1515       FETCH( func, *inst, i, 0, i );
1516
1517       if (projected) {
1518          sse_mulps(
1519             func,
1520             make_xmm( i ),
1521             make_xmm( 3 ) );
1522       }
1523
1524       /* Store in the argument buffer:
1525        */
1526       sse_movaps(
1527          func,
1528          get_temp( TEMP_R0, i ),
1529          make_xmm( i ) );
1530    }
1531
1532    args[0] = get_temp( TEMP_R0, 0 );
1533    args[1] = get_sampler_ptr( unit );
1534
1535
1536    emit_func_call( func,
1537                    0,
1538                    args,
1539                    Elements(args),
1540                    fetch_texel );
1541
1542    /* If all four channels are enabled, could use a pointer to
1543     * dst[0].x instead of TEMP_R0 for store?
1544     */
1545    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
1546
1547       sse_movaps(
1548          func,
1549          make_xmm( 0 ),
1550          get_temp( TEMP_R0, i ) );
1551
1552       STORE( func, *inst, 0, 0, i );
1553    }
1554 }
1555
1556
1557 static void
1558 emit_kil(
1559    struct x86_function *func,
1560    const struct tgsi_full_src_register *reg )
1561 {
1562    unsigned uniquemask;
1563    unsigned unique_count = 0;
1564    unsigned chan_index;
1565    unsigned i;
1566
1567    /* This mask stores component bits that were already tested. Note that
1568     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1569     * tested. */
1570    uniquemask = 0;
1571
1572    FOR_EACH_CHANNEL( chan_index ) {
1573       unsigned swizzle;
1574
1575       /* unswizzle channel */
1576       swizzle = tgsi_util_get_full_src_register_swizzle(
1577          reg,
1578          chan_index );
1579
1580       /* check if the component has not been already tested */
1581       if( !(uniquemask & (1 << swizzle)) ) {
1582          uniquemask |= 1 << swizzle;
1583
1584          /* allocate register */
1585          emit_fetch(
1586             func,
1587             unique_count++,
1588             reg,
1589             chan_index );
1590       }
1591    }
1592
1593    x86_push(
1594       func,
1595       x86_make_reg( file_REG32, reg_AX ) );
1596    x86_push(
1597       func,
1598       x86_make_reg( file_REG32, reg_DX ) );
1599
1600    for (i = 0 ; i < unique_count; i++ ) {
1601       struct x86_reg dataXMM = make_xmm(i);
1602
1603       sse_cmpps(
1604          func,
1605          dataXMM,
1606          get_temp(
1607             TGSI_EXEC_TEMP_00000000_I,
1608             TGSI_EXEC_TEMP_00000000_C ),
1609          cc_LessThan );
1610
1611       if( i == 0 ) {
1612          sse_movmskps(
1613             func,
1614             x86_make_reg( file_REG32, reg_AX ),
1615             dataXMM );
1616       }
1617       else {
1618          sse_movmskps(
1619             func,
1620             x86_make_reg( file_REG32, reg_DX ),
1621             dataXMM );
1622          x86_or(
1623             func,
1624             x86_make_reg( file_REG32, reg_AX ),
1625             x86_make_reg( file_REG32, reg_DX ) );
1626       }
1627    }
1628
1629    x86_or(
1630       func,
1631       get_temp(
1632          TGSI_EXEC_TEMP_KILMASK_I,
1633          TGSI_EXEC_TEMP_KILMASK_C ),
1634       x86_make_reg( file_REG32, reg_AX ) );
1635
1636    x86_pop(
1637       func,
1638       x86_make_reg( file_REG32, reg_DX ) );
1639    x86_pop(
1640       func,
1641       x86_make_reg( file_REG32, reg_AX ) );
1642 }
1643
1644
/**
 * Predicated kill.  Not implemented: no code is emitted.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
   (void) func;
}
1651
1652
1653 static void
1654 emit_setcc(
1655    struct x86_function *func,
1656    struct tgsi_full_instruction *inst,
1657    enum sse_cc cc )
1658 {
1659    unsigned chan_index;
1660
1661    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1662       FETCH( func, *inst, 0, 0, chan_index );
1663       FETCH( func, *inst, 1, 1, chan_index );
1664       sse_cmpps(
1665          func,
1666          make_xmm( 0 ),
1667          make_xmm( 1 ),
1668          cc );
1669       sse_andps(
1670          func,
1671          make_xmm( 0 ),
1672          get_temp(
1673             TEMP_ONE_I,
1674             TEMP_ONE_C ) );
1675       STORE( func, *inst, 0, 0, chan_index );
1676    }
1677 }
1678
1679 static void
1680 emit_cmp(
1681    struct x86_function *func,
1682    struct tgsi_full_instruction *inst )
1683 {
1684    unsigned chan_index;
1685
1686    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1687       FETCH( func, *inst, 0, 0, chan_index );
1688       FETCH( func, *inst, 1, 1, chan_index );
1689       FETCH( func, *inst, 2, 2, chan_index );
1690       sse_cmpps(
1691          func,
1692          make_xmm( 0 ),
1693          get_temp(
1694             TGSI_EXEC_TEMP_00000000_I,
1695             TGSI_EXEC_TEMP_00000000_C ),
1696          cc_LessThan );
1697       sse_andps(
1698          func,
1699          make_xmm( 1 ),
1700          make_xmm( 0 ) );
1701       sse_andnps(
1702          func,
1703          make_xmm( 0 ),
1704          make_xmm( 2 ) );
1705       sse_orps(
1706          func,
1707          make_xmm( 0 ),
1708          make_xmm( 1 ) );
1709       STORE( func, *inst, 0, 0, chan_index );
1710    }
1711 }
1712
1713
1714 /**
1715  * Check if inst src/dest regs use indirect addressing into temporary
1716  * register file.
1717  */
1718 static boolean
1719 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1720 {
1721    uint i;
1722    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1723       const struct tgsi_full_src_register *reg = &inst->Src[i];
1724       if (reg->Register.File == TGSI_FILE_TEMPORARY &&
1725           reg->Register.Indirect)
1726          return TRUE;
1727    }
1728    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1729       const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1730       if (reg->Register.File == TGSI_FILE_TEMPORARY &&
1731           reg->Register.Indirect)
1732          return TRUE;
1733    }
1734    return FALSE;
1735 }
1736
1737
1738 static int
1739 emit_instruction(
1740    struct x86_function *func,
1741    struct tgsi_full_instruction *inst )
1742 {
1743    unsigned chan_index;
1744
1745    /* we can't handle indirect addressing into temp register file yet */
1746    if (indirect_temp_reference(inst))
1747       return FALSE;
1748
1749    switch (inst->Instruction.Opcode) {
1750    case TGSI_OPCODE_ARL:
1751       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1752          FETCH( func, *inst, 0, 0, chan_index );
1753          emit_flr(func, 0, 0);
1754          emit_f2it( func, 0 );
1755          STORE( func, *inst, 0, 0, chan_index );
1756       }
1757       break;
1758
1759    case TGSI_OPCODE_MOV:
1760       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1761          FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1762       }
1763       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1764          STORE( func, *inst, 4 + chan_index, 0, chan_index );
1765       }
1766       break;
1767
1768    case TGSI_OPCODE_LIT:
1769       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1770           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1771          emit_tempf(
1772             func,
1773             0,
1774             TEMP_ONE_I,
1775             TEMP_ONE_C);
1776          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1777             STORE( func, *inst, 0, 0, CHAN_X );
1778          }
1779          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1780             STORE( func, *inst, 0, 0, CHAN_W );
1781          }
1782       }
1783       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1784           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1785          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1786             FETCH( func, *inst, 0, 0, CHAN_X );
1787             sse_maxps(
1788                func,
1789                make_xmm( 0 ),
1790                get_temp(
1791                   TGSI_EXEC_TEMP_00000000_I,
1792                   TGSI_EXEC_TEMP_00000000_C ) );
1793             STORE( func, *inst, 0, 0, CHAN_Y );
1794          }
1795          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1796             /* XMM[1] = SrcReg[0].yyyy */
1797             FETCH( func, *inst, 1, 0, CHAN_Y );
1798             /* XMM[1] = max(XMM[1], 0) */
1799             sse_maxps(
1800                func,
1801                make_xmm( 1 ),
1802                get_temp(
1803                   TGSI_EXEC_TEMP_00000000_I,
1804                   TGSI_EXEC_TEMP_00000000_C ) );
1805             /* XMM[2] = SrcReg[0].wwww */
1806             FETCH( func, *inst, 2, 0, CHAN_W );
1807             /* XMM[2] = min(XMM[2], 128.0) */
1808             sse_minps(
1809                func,
1810                make_xmm( 2 ),
1811                get_temp(
1812                   TGSI_EXEC_TEMP_128_I,
1813                   TGSI_EXEC_TEMP_128_C ) );
1814             /* XMM[2] = max(XMM[2], -128.0) */
1815             sse_maxps(
1816                func,
1817                make_xmm( 2 ),
1818                get_temp(
1819                   TGSI_EXEC_TEMP_MINUS_128_I,
1820                   TGSI_EXEC_TEMP_MINUS_128_C ) );
1821             emit_pow( func, 3, 1, 1, 2 );
1822             FETCH( func, *inst, 0, 0, CHAN_X );
1823             sse_xorps(
1824                func,
1825                make_xmm( 2 ),
1826                make_xmm( 2 ) );
1827             sse_cmpps(
1828                func,
1829                make_xmm( 2 ),
1830                make_xmm( 0 ),
1831                cc_LessThan );
1832             sse_andps(
1833                func,
1834                make_xmm( 2 ),
1835                make_xmm( 1 ) );
1836             STORE( func, *inst, 2, 0, CHAN_Z );
1837          }
1838       }
1839       break;
1840
1841    case TGSI_OPCODE_RCP:
1842       FETCH( func, *inst, 0, 0, CHAN_X );
1843       emit_rcp( func, 0, 0 );
1844       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1845          STORE( func, *inst, 0, 0, chan_index );
1846       }
1847       break;
1848
1849    case TGSI_OPCODE_RSQ:
1850       FETCH( func, *inst, 0, 0, CHAN_X );
1851       emit_abs( func, 0 );
1852       emit_rsqrt( func, 1, 0 );
1853       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1854          STORE( func, *inst, 1, 0, chan_index );
1855       }
1856       break;
1857
1858    case TGSI_OPCODE_EXP:
1859       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1860           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1861           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1862          FETCH( func, *inst, 0, 0, CHAN_X );
1863          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1864              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1865             emit_MOV( func, 1, 0 );
1866             emit_flr( func, 2, 1 );
1867             /* dst.x = ex2(floor(src.x)) */
1868             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1869                emit_MOV( func, 2, 1 );
1870                emit_ex2( func, 3, 2 );
1871                STORE( func, *inst, 2, 0, CHAN_X );
1872             }
1873             /* dst.y = src.x - floor(src.x) */
1874             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1875                emit_MOV( func, 2, 0 );
1876                emit_sub( func, 2, 1 );
1877                STORE( func, *inst, 2, 0, CHAN_Y );
1878             }
1879          }
1880          /* dst.z = ex2(src.x) */
1881          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1882             emit_ex2( func, 3, 0 );
1883             STORE( func, *inst, 0, 0, CHAN_Z );
1884          }
1885       }
1886       /* dst.w = 1.0 */
1887       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1888          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1889          STORE( func, *inst, 0, 0, CHAN_W );
1890       }
1891       break;
1892
1893    case TGSI_OPCODE_LOG:
1894       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1895           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1896           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1897          FETCH( func, *inst, 0, 0, CHAN_X );
1898          emit_abs( func, 0 );
1899          emit_MOV( func, 1, 0 );
1900          emit_lg2( func, 2, 1 );
1901          /* dst.z = lg2(abs(src.x)) */
1902          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1903             STORE( func, *inst, 1, 0, CHAN_Z );
1904          }
1905          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1906              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1907             emit_flr( func, 2, 1 );
1908             /* dst.x = floor(lg2(abs(src.x))) */
1909             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1910                STORE( func, *inst, 1, 0, CHAN_X );
1911             }
            /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1913             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1914                emit_ex2( func, 2, 1 );
1915                emit_rcp( func, 1, 1 );
1916                emit_mul( func, 0, 1 );
1917                STORE( func, *inst, 0, 0, CHAN_Y );
1918             }
1919          }
1920       }
1921       /* dst.w = 1.0 */
1922       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1923          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1924          STORE( func, *inst, 0, 0, CHAN_W );
1925       }
1926       break;
1927
1928    case TGSI_OPCODE_MUL:
1929       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1930          FETCH( func, *inst, 0, 0, chan_index );
1931          FETCH( func, *inst, 1, 1, chan_index );
1932          emit_mul( func, 0, 1 );
1933          STORE( func, *inst, 0, 0, chan_index );
1934       }
1935       break;
1936
1937    case TGSI_OPCODE_ADD:
1938       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1939          FETCH( func, *inst, 0, 0, chan_index );
1940          FETCH( func, *inst, 1, 1, chan_index );
1941          emit_add( func, 0, 1 );
1942          STORE( func, *inst, 0, 0, chan_index );
1943       }
1944       break;
1945
1946    case TGSI_OPCODE_DP3:
1947       FETCH( func, *inst, 0, 0, CHAN_X );
1948       FETCH( func, *inst, 1, 1, CHAN_X );
1949       emit_mul( func, 0, 1 );
1950       FETCH( func, *inst, 1, 0, CHAN_Y );
1951       FETCH( func, *inst, 2, 1, CHAN_Y );
1952       emit_mul( func, 1, 2 );
1953       emit_add( func, 0, 1 );
1954       FETCH( func, *inst, 1, 0, CHAN_Z );
1955       FETCH( func, *inst, 2, 1, CHAN_Z );
1956       emit_mul( func, 1, 2 );
1957       emit_add( func, 0, 1 );
1958       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1959          STORE( func, *inst, 0, 0, chan_index );
1960       }
1961       break;
1962
1963    case TGSI_OPCODE_DP4:
1964       FETCH( func, *inst, 0, 0, CHAN_X );
1965       FETCH( func, *inst, 1, 1, CHAN_X );
1966       emit_mul( func, 0, 1 );
1967       FETCH( func, *inst, 1, 0, CHAN_Y );
1968       FETCH( func, *inst, 2, 1, CHAN_Y );
1969       emit_mul( func, 1, 2 );
1970       emit_add( func, 0, 1 );
1971       FETCH( func, *inst, 1, 0, CHAN_Z );
1972       FETCH( func, *inst, 2, 1, CHAN_Z );
1973       emit_mul(func, 1, 2 );
1974       emit_add(func, 0, 1 );
1975       FETCH( func, *inst, 1, 0, CHAN_W );
1976       FETCH( func, *inst, 2, 1, CHAN_W );
1977       emit_mul( func, 1, 2 );
1978       emit_add( func, 0, 1 );
1979       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1980          STORE( func, *inst, 0, 0, chan_index );
1981       }
1982       break;
1983
1984    case TGSI_OPCODE_DST:
1985       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1986          emit_tempf(
1987             func,
1988             0,
1989             TEMP_ONE_I,
1990             TEMP_ONE_C );
1991          STORE( func, *inst, 0, 0, CHAN_X );
1992       }
1993       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1994          FETCH( func, *inst, 0, 0, CHAN_Y );
1995          FETCH( func, *inst, 1, 1, CHAN_Y );
1996          emit_mul( func, 0, 1 );
1997          STORE( func, *inst, 0, 0, CHAN_Y );
1998       }
1999       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2000          FETCH( func, *inst, 0, 0, CHAN_Z );
2001          STORE( func, *inst, 0, 0, CHAN_Z );
2002       }
2003       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2004          FETCH( func, *inst, 0, 1, CHAN_W );
2005          STORE( func, *inst, 0, 0, CHAN_W );
2006       }
2007       break;
2008
2009    case TGSI_OPCODE_MIN:
2010       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2011          FETCH( func, *inst, 0, 0, chan_index );
2012          FETCH( func, *inst, 1, 1, chan_index );
2013          sse_minps(
2014             func,
2015             make_xmm( 0 ),
2016             make_xmm( 1 ) );
2017          STORE( func, *inst, 0, 0, chan_index );
2018       }
2019       break;
2020
2021    case TGSI_OPCODE_MAX:
2022       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2023          FETCH( func, *inst, 0, 0, chan_index );
2024          FETCH( func, *inst, 1, 1, chan_index );
2025          sse_maxps(
2026             func,
2027             make_xmm( 0 ),
2028             make_xmm( 1 ) );
2029          STORE( func, *inst, 0, 0, chan_index );
2030       }
2031       break;
2032
2033    case TGSI_OPCODE_SLT:
2034       emit_setcc( func, inst, cc_LessThan );
2035       break;
2036
2037    case TGSI_OPCODE_SGE:
2038       emit_setcc( func, inst, cc_NotLessThan );
2039       break;
2040
2041    case TGSI_OPCODE_MAD:
2042       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2043          FETCH( func, *inst, 0, 0, chan_index );
2044          FETCH( func, *inst, 1, 1, chan_index );
2045          FETCH( func, *inst, 2, 2, chan_index );
2046          emit_mul( func, 0, 1 );
2047          emit_add( func, 0, 2 );
2048          STORE( func, *inst, 0, 0, chan_index );
2049       }
2050       break;
2051
2052    case TGSI_OPCODE_SUB:
2053       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2054          FETCH( func, *inst, 0, 0, chan_index );
2055          FETCH( func, *inst, 1, 1, chan_index );
2056          emit_sub( func, 0, 1 );
2057          STORE( func, *inst, 0, 0, chan_index );
2058       }
2059       break;
2060
2061    case TGSI_OPCODE_LRP:
2062       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2063          FETCH( func, *inst, 0, 0, chan_index );
2064          FETCH( func, *inst, 1, 1, chan_index );
2065          FETCH( func, *inst, 2, 2, chan_index );
2066          emit_sub( func, 1, 2 );
2067          emit_mul( func, 0, 1 );
2068          emit_add( func, 0, 2 );
2069          STORE( func, *inst, 0, 0, chan_index );
2070       }
2071       break;
2072
2073    case TGSI_OPCODE_CND:
2074       return 0;
2075       break;
2076
2077    case TGSI_OPCODE_DP2A:
2078       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2079       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2080       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2081       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2082       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2083       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2084       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2085       FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
2086       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2087       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2088          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2089       }
2090       break;
2091
2092    case TGSI_OPCODE_FRC:
2093       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2094          FETCH( func, *inst, 0, 0, chan_index );
2095          emit_frc( func, 0, 0 );
2096          STORE( func, *inst, 0, 0, chan_index );
2097       }
2098       break;
2099
2100    case TGSI_OPCODE_CLAMP:
2101       return 0;
2102       break;
2103
2104    case TGSI_OPCODE_FLR:
2105       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2106          FETCH( func, *inst, 0, 0, chan_index );
2107          emit_flr( func, 0, 0 );
2108          STORE( func, *inst, 0, 0, chan_index );
2109       }
2110       break;
2111
2112    case TGSI_OPCODE_ROUND:
2113       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2114          FETCH( func, *inst, 0, 0, chan_index );
2115          emit_rnd( func, 0, 0 );
2116          STORE( func, *inst, 0, 0, chan_index );
2117       }
2118       break;
2119
2120    case TGSI_OPCODE_EX2:
2121       FETCH( func, *inst, 0, 0, CHAN_X );
2122       emit_ex2( func, 0, 0 );
2123       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2124          STORE( func, *inst, 0, 0, chan_index );
2125       }
2126       break;
2127
2128    case TGSI_OPCODE_LG2:
2129       FETCH( func, *inst, 0, 0, CHAN_X );
2130       emit_lg2( func, 0, 0 );
2131       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2132          STORE( func, *inst, 0, 0, chan_index );
2133       }
2134       break;
2135
2136    case TGSI_OPCODE_POW:
2137       FETCH( func, *inst, 0, 0, CHAN_X );
2138       FETCH( func, *inst, 1, 1, CHAN_X );
2139       emit_pow( func, 0, 0, 0, 1 );
2140       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2141          STORE( func, *inst, 0, 0, chan_index );
2142       }
2143       break;
2144
2145    case TGSI_OPCODE_XPD:
2146       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2147           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2148          FETCH( func, *inst, 1, 1, CHAN_Z );
2149          FETCH( func, *inst, 3, 0, CHAN_Z );
2150       }
2151       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2152           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2153          FETCH( func, *inst, 0, 0, CHAN_Y );
2154          FETCH( func, *inst, 4, 1, CHAN_Y );
2155       }
2156       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2157          emit_MOV( func, 2, 0 );
2158          emit_mul( func, 2, 1 );
2159          emit_MOV( func, 5, 3 );
2160          emit_mul( func, 5, 4 );
2161          emit_sub( func, 2, 5 );
2162          STORE( func, *inst, 2, 0, CHAN_X );
2163       }
2164       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2165           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2166          FETCH( func, *inst, 2, 1, CHAN_X );
2167          FETCH( func, *inst, 5, 0, CHAN_X );
2168       }
2169       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2170          emit_mul( func, 3, 2 );
2171          emit_mul( func, 1, 5 );
2172          emit_sub( func, 3, 1 );
2173          STORE( func, *inst, 3, 0, CHAN_Y );
2174       }
2175       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2176          emit_mul( func, 5, 4 );
2177          emit_mul( func, 0, 2 );
2178          emit_sub( func, 5, 0 );
2179          STORE( func, *inst, 5, 0, CHAN_Z );
2180       }
2181       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2182          emit_tempf(
2183             func,
2184             0,
2185             TEMP_ONE_I,
2186             TEMP_ONE_C );
2187          STORE( func, *inst, 0, 0, CHAN_W );
2188       }
2189       break;
2190
2191    case TGSI_OPCODE_ABS:
2192       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2193          FETCH( func, *inst, 0, 0, chan_index );
2194          emit_abs( func, 0) ;
2195
2196          STORE( func, *inst, 0, 0, chan_index );
2197       }
2198       break;
2199
2200    case TGSI_OPCODE_RCC:
2201       return 0;
2202       break;
2203
2204    case TGSI_OPCODE_DPH:
2205       FETCH( func, *inst, 0, 0, CHAN_X );
2206       FETCH( func, *inst, 1, 1, CHAN_X );
2207       emit_mul( func, 0, 1 );
2208       FETCH( func, *inst, 1, 0, CHAN_Y );
2209       FETCH( func, *inst, 2, 1, CHAN_Y );
2210       emit_mul( func, 1, 2 );
2211       emit_add( func, 0, 1 );
2212       FETCH( func, *inst, 1, 0, CHAN_Z );
2213       FETCH( func, *inst, 2, 1, CHAN_Z );
2214       emit_mul( func, 1, 2 );
2215       emit_add( func, 0, 1 );
2216       FETCH( func, *inst, 1, 1, CHAN_W );
2217       emit_add( func, 0, 1 );
2218       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2219          STORE( func, *inst, 0, 0, chan_index );
2220       }
2221       break;
2222
2223    case TGSI_OPCODE_COS:
2224       FETCH( func, *inst, 0, 0, CHAN_X );
2225       emit_cos( func, 0, 0 );
2226       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2227          STORE( func, *inst, 0, 0, chan_index );
2228       }
2229       break;
2230
2231    case TGSI_OPCODE_DDX:
2232       return 0;
2233       break;
2234
2235    case TGSI_OPCODE_DDY:
2236       return 0;
2237       break;
2238
2239    case TGSI_OPCODE_KILP:
2240       /* predicated kill */
2241       emit_kilp( func );
2242       return 0; /* XXX fix me */
2243       break;
2244
2245    case TGSI_OPCODE_KIL:
2246       /* conditional kill */
2247       emit_kil( func, &inst->Src[0] );
2248       break;
2249
2250    case TGSI_OPCODE_PK2H:
2251       return 0;
2252       break;
2253
2254    case TGSI_OPCODE_PK2US:
2255       return 0;
2256       break;
2257
2258    case TGSI_OPCODE_PK4B:
2259       return 0;
2260       break;
2261
2262    case TGSI_OPCODE_PK4UB:
2263       return 0;
2264       break;
2265
2266    case TGSI_OPCODE_RFL:
2267       return 0;
2268       break;
2269
2270    case TGSI_OPCODE_SEQ:
2271       emit_setcc( func, inst, cc_Equal );
2272       break;
2273
2274    case TGSI_OPCODE_SFL:
2275       return 0;
2276       break;
2277
2278    case TGSI_OPCODE_SGT:
2279       emit_setcc( func, inst, cc_NotLessThanEqual );
2280       break;
2281
2282    case TGSI_OPCODE_SIN:
2283       FETCH( func, *inst, 0, 0, CHAN_X );
2284       emit_sin( func, 0, 0 );
2285       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2286          STORE( func, *inst, 0, 0, chan_index );
2287       }
2288       break;
2289
2290    case TGSI_OPCODE_SLE:
2291       emit_setcc( func, inst, cc_LessThanEqual );
2292       break;
2293
2294    case TGSI_OPCODE_SNE:
2295       emit_setcc( func, inst, cc_NotEqual );
2296       break;
2297
2298    case TGSI_OPCODE_STR:
2299       return 0;
2300       break;
2301
2302    case TGSI_OPCODE_TEX:
2303       emit_tex( func, inst, FALSE, FALSE );
2304       break;
2305
2306    case TGSI_OPCODE_TXD:
2307       return 0;
2308       break;
2309
2310    case TGSI_OPCODE_UP2H:
2311       return 0;
2312       break;
2313
2314    case TGSI_OPCODE_UP2US:
2315       return 0;
2316       break;
2317
2318    case TGSI_OPCODE_UP4B:
2319       return 0;
2320       break;
2321
2322    case TGSI_OPCODE_UP4UB:
2323       return 0;
2324       break;
2325
2326    case TGSI_OPCODE_X2D:
2327       return 0;
2328       break;
2329
2330    case TGSI_OPCODE_ARA:
2331       return 0;
2332       break;
2333
2334    case TGSI_OPCODE_ARR:
2335       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2336          FETCH( func, *inst, 0, 0, chan_index );
2337          emit_rnd( func, 0, 0 );
2338          emit_f2it( func, 0 );
2339          STORE( func, *inst, 0, 0, chan_index );
2340       }
2341       break;
2342
2343    case TGSI_OPCODE_BRA:
2344       return 0;
2345       break;
2346
2347    case TGSI_OPCODE_CAL:
2348       return 0;
2349       break;
2350
2351    case TGSI_OPCODE_RET:
2352       emit_ret( func );
2353       break;
2354
2355    case TGSI_OPCODE_END:
2356       break;
2357
2358    case TGSI_OPCODE_SSG:
2359       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2360          FETCH( func, *inst, 0, 0, chan_index );
2361          emit_sgn( func, 0, 0 );
2362          STORE( func, *inst, 0, 0, chan_index );
2363       }
2364       break;
2365
2366    case TGSI_OPCODE_CMP:
2367       emit_cmp (func, inst);
2368       break;
2369
2370    case TGSI_OPCODE_SCS:
2371       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2372          FETCH( func, *inst, 0, 0, CHAN_X );
2373          emit_cos( func, 0, 0 );
2374          STORE( func, *inst, 0, 0, CHAN_X );
2375       }
2376       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2377          FETCH( func, *inst, 0, 0, CHAN_X );
2378          emit_sin( func, 0, 0 );
2379          STORE( func, *inst, 0, 0, CHAN_Y );
2380       }
2381       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2382          emit_tempf(
2383             func,
2384             0,
2385             TGSI_EXEC_TEMP_00000000_I,
2386             TGSI_EXEC_TEMP_00000000_C );
2387          STORE( func, *inst, 0, 0, CHAN_Z );
2388       }
2389       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2390          emit_tempf(
2391             func,
2392             0,
2393             TEMP_ONE_I,
2394             TEMP_ONE_C );
2395          STORE( func, *inst, 0, 0, CHAN_W );
2396       }
2397       break;
2398
2399    case TGSI_OPCODE_TXB:
2400       emit_tex( func, inst, TRUE, FALSE );
2401       break;
2402
2403    case TGSI_OPCODE_NRM:
2404       /* fall-through */
2405    case TGSI_OPCODE_NRM4:
2406       /* 3 or 4-component normalization */
2407       {
2408          uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2409
2410          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2411              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2412              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2413              (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2414
2415             /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2416
2417             /* xmm4 = src.x */
2418             /* xmm0 = src.x * src.x */
2419             FETCH(func, *inst, 0, 0, CHAN_X);
2420             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2421                emit_MOV(func, 4, 0);
2422             }
2423             emit_mul(func, 0, 0);
2424
2425             /* xmm5 = src.y */
2426             /* xmm0 = xmm0 + src.y * src.y */
2427             FETCH(func, *inst, 1, 0, CHAN_Y);
2428             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2429                emit_MOV(func, 5, 1);
2430             }
2431             emit_mul(func, 1, 1);
2432             emit_add(func, 0, 1);
2433
2434             /* xmm6 = src.z */
2435             /* xmm0 = xmm0 + src.z * src.z */
2436             FETCH(func, *inst, 1, 0, CHAN_Z);
2437             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2438                emit_MOV(func, 6, 1);
2439             }
2440             emit_mul(func, 1, 1);
2441             emit_add(func, 0, 1);
2442
2443             if (dims == 4) {
2444                /* xmm7 = src.w */
2445                /* xmm0 = xmm0 + src.w * src.w */
2446                FETCH(func, *inst, 1, 0, CHAN_W);
2447                if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2448                   emit_MOV(func, 7, 1);
2449                }
2450                emit_mul(func, 1, 1);
2451                emit_add(func, 0, 1);
2452             }
2453
2454             /* xmm1 = 1 / sqrt(xmm0) */
2455             emit_rsqrt(func, 1, 0);
2456
2457             /* dst.x = xmm1 * src.x */
2458             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2459                emit_mul(func, 4, 1);
2460                STORE(func, *inst, 4, 0, CHAN_X);
2461             }
2462
2463             /* dst.y = xmm1 * src.y */
2464             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2465                emit_mul(func, 5, 1);
2466                STORE(func, *inst, 5, 0, CHAN_Y);
2467             }
2468
2469             /* dst.z = xmm1 * src.z */
2470             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2471                emit_mul(func, 6, 1);
2472                STORE(func, *inst, 6, 0, CHAN_Z);
2473             }
2474
2475             /* dst.w = xmm1 * src.w */
2476             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2477                emit_mul(func, 7, 1);
2478                STORE(func, *inst, 7, 0, CHAN_W);
2479             }
2480          }
2481
2482          /* dst0.w = 1.0 */
2483          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2484             emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2485             STORE(func, *inst, 0, 0, CHAN_W);
2486          }
2487       }
2488       break;
2489
2490    case TGSI_OPCODE_DIV:
2491       return 0;
2492       break;
2493
2494    case TGSI_OPCODE_DP2:
2495       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2496       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2497       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2498       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2499       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2500       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2501       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2502       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2503          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2504       }
2505       break;
2506
2507    case TGSI_OPCODE_TXL:
2508       emit_tex( func, inst, TRUE, FALSE );
2509       break;
2510
2511    case TGSI_OPCODE_TXP:
2512       emit_tex( func, inst, FALSE, TRUE );
2513       break;
2514
2515    case TGSI_OPCODE_BRK:
2516       return 0;
2517       break;
2518
2519    case TGSI_OPCODE_IF:
2520       return 0;
2521       break;
2522
2523    case TGSI_OPCODE_BGNFOR:
2524       return 0;
2525       break;
2526
2527    case TGSI_OPCODE_REP:
2528       return 0;
2529       break;
2530
2531    case TGSI_OPCODE_ELSE:
2532       return 0;
2533       break;
2534
2535    case TGSI_OPCODE_ENDIF:
2536       return 0;
2537       break;
2538
2539    case TGSI_OPCODE_ENDFOR:
2540       return 0;
2541       break;
2542
2543    case TGSI_OPCODE_ENDREP:
2544       return 0;
2545       break;
2546
2547    case TGSI_OPCODE_PUSHA:
2548       return 0;
2549       break;
2550
2551    case TGSI_OPCODE_POPA:
2552       return 0;
2553       break;
2554
2555    case TGSI_OPCODE_CEIL:
2556       return 0;
2557       break;
2558
2559    case TGSI_OPCODE_I2F:
2560       return 0;
2561       break;
2562
2563    case TGSI_OPCODE_NOT:
2564       return 0;
2565       break;
2566
2567    case TGSI_OPCODE_TRUNC:
2568       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2569          FETCH( func, *inst, 0, 0, chan_index );
2570          emit_f2it( func, 0 );
2571          emit_i2f( func, 0 );
2572          STORE( func, *inst, 0, 0, chan_index );
2573       }
2574       break;
2575
2576    case TGSI_OPCODE_SHL:
2577       return 0;
2578       break;
2579
2580    case TGSI_OPCODE_SHR:
2581       return 0;
2582       break;
2583
2584    case TGSI_OPCODE_AND:
2585       return 0;
2586       break;
2587
2588    case TGSI_OPCODE_OR:
2589       return 0;
2590       break;
2591
2592    case TGSI_OPCODE_MOD:
2593       return 0;
2594       break;
2595
2596    case TGSI_OPCODE_XOR:
2597       return 0;
2598       break;
2599
2600    case TGSI_OPCODE_SAD:
2601       return 0;
2602       break;
2603
2604    case TGSI_OPCODE_TXF:
2605       return 0;
2606       break;
2607
2608    case TGSI_OPCODE_TXQ:
2609       return 0;
2610       break;
2611
2612    case TGSI_OPCODE_CONT:
2613       return 0;
2614       break;
2615
2616    case TGSI_OPCODE_EMIT:
2617       return 0;
2618       break;
2619
2620    case TGSI_OPCODE_ENDPRIM:
2621       return 0;
2622       break;
2623
2624    default:
2625       return 0;
2626    }
2627
2628    return 1;
2629 }
2630
2631 static void
2632 emit_declaration(
2633    struct x86_function *func,
2634    struct tgsi_full_declaration *decl )
2635 {
2636    if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2637       unsigned first, last, mask;
2638       unsigned i, j;
2639
2640       first = decl->Range.First;
2641       last = decl->Range.Last;
2642       mask = decl->Declaration.UsageMask;
2643
2644       for( i = first; i <= last; i++ ) {
2645          for( j = 0; j < NUM_CHANNELS; j++ ) {
2646             if( mask & (1 << j) ) {
2647                switch( decl->Declaration.Interpolate ) {
2648                case TGSI_INTERPOLATE_CONSTANT:
2649                   emit_coef_a0( func, 0, i, j );
2650                   emit_inputs( func, 0, i, j );
2651                   break;
2652
2653                case TGSI_INTERPOLATE_LINEAR:
2654                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2655                   emit_coef_dadx( func, 1, i, j );
2656                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2657                   emit_coef_dady( func, 3, i, j );
2658                   emit_mul( func, 0, 1 );    /* x * dadx */
2659                   emit_coef_a0( func, 4, i, j );
2660                   emit_mul( func, 2, 3 );    /* y * dady */
2661                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
2662                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2663                   emit_inputs( func, 0, i, j );
2664                   break;
2665
2666                case TGSI_INTERPOLATE_PERSPECTIVE:
2667                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2668                   emit_coef_dadx( func, 1, i, j );
2669                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2670                   emit_coef_dady( func, 3, i, j );
2671                   emit_mul( func, 0, 1 );    /* x * dadx */
2672                   emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2673                   emit_coef_a0( func, 5, i, j );
2674                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
2675                   emit_mul( func, 2, 3 );    /* y * dady */
2676                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
2677                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2678                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
2679                   emit_inputs( func, 0, i, j );
2680                   break;
2681
2682                default:
2683                   assert( 0 );
2684                   break;
2685                }
2686             }
2687          }
2688       }
2689    }
2690 }
2691
2692 static void aos_to_soa( struct x86_function *func,
2693                         uint arg_aos,
2694                         uint arg_machine,
2695                         uint arg_num,
2696                         uint arg_stride )
2697 {
2698    struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2699    struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2700    struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2701    struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2702    int inner_loop;
2703
2704
2705    /* Save EBX */
2706    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2707
2708    x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
2709    x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
2710    x86_lea( func, soa_input,
2711             x86_make_disp( soa_input,
2712                            Offset(struct tgsi_exec_machine, Inputs) ) );
2713    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2714    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
2715
2716    /* do */
2717    inner_loop = x86_get_label( func );
2718    {
2719       x86_push( func, aos_input );
2720       sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2721       sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2722       x86_add( func, aos_input, stride );
2723       sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2724       sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2725       x86_add( func, aos_input, stride );
2726       sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2727       sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2728       x86_add( func, aos_input, stride );
2729       sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2730       sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2731       x86_pop( func, aos_input );
2732
2733       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2734       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2735       sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2736       sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2737       sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2738       sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2739
2740       sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2741       sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2742       sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2743       sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2744
2745       /* Advance to next input */
2746       x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2747       x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2748    }
2749    /* while --num_inputs */
2750    x86_dec( func, num_inputs );
2751    x86_jcc( func, cc_NE, inner_loop );
2752
2753    /* Restore EBX */
2754    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2755 }
2756
2757 static void soa_to_aos( struct x86_function *func,
2758                         uint arg_aos,
2759                         uint arg_machine,
2760                         uint arg_num,
2761                         uint arg_stride )
2762 {
2763    struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2764    struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2765    struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2766    struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2767    int inner_loop;
2768
2769    /* Save EBX */
2770    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2771
2772    x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2773    x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2774    x86_lea( func, soa_output,
2775             x86_make_disp( soa_output,
2776                            Offset(struct tgsi_exec_machine, Outputs) ) );
2777    x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2778
2779    /* do */
2780    inner_loop = x86_get_label( func );
2781    {
2782       sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2783       sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2784       sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2785       sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2786
2787       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2788       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2789       sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2790       sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2791       sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2792       sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2793
2794       x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2795       x86_push( func, aos_output );
2796       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2797       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2798       x86_add( func, aos_output, temp );
2799       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2800       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2801       x86_add( func, aos_output, temp );
2802       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2803       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2804       x86_add( func, aos_output, temp );
2805       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2806       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2807       x86_pop( func, aos_output );
2808
2809       /* Advance to next output */
2810       x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2811       x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2812    }
2813    /* while --num_outputs */
2814    x86_dec( func, num_outputs );
2815    x86_jcc( func, cc_NE, inner_loop );
2816
2817    /* Restore EBX */
2818    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2819 }
2820
2821 /**
2822  * Translate a TGSI vertex/fragment shader to SSE2 code.
2823  * Slightly different things are done for vertex vs. fragment shaders.
2824  *
2825  * \param tokens  the TGSI input shader
2826  * \param func  the output SSE code/function
2827  * \param immediates  buffer to place immediates, later passed to SSE func
2828  * \param return  1 for success, 0 if translation failed
2829  */
2830 unsigned
2831 tgsi_emit_sse2(
2832    const struct tgsi_token *tokens,
2833    struct x86_function *func,
2834    float (*immediates)[4],
2835    boolean do_swizzles )
2836 {
2837    struct tgsi_parse_context parse;
2838    unsigned ok = 1;
2839    uint num_immediates = 0;
2840
2841    util_init_math();
2842
2843    func->csr = func->store;
2844
2845    tgsi_parse_init( &parse, tokens );
2846
2847    /* Can't just use EDI, EBX without save/restoring them:
2848     */
2849    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2850    x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2851
2852    /*
2853     * Different function args for vertex/fragment shaders:
2854     */
2855    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2856       if (do_swizzles)
2857          aos_to_soa( func,
2858                      4,         /* aos_input */
2859                      1,         /* machine */
2860                      5,         /* num_inputs */
2861                      6 );       /* input_stride */
2862    }
2863
2864    x86_mov(
2865       func,
2866       get_machine_base(),
2867       x86_fn_arg( func, 1 ) );
2868    x86_mov(
2869       func,
2870       get_const_base(),
2871       x86_fn_arg( func, 2 ) );
2872    x86_mov(
2873       func,
2874       get_immediate_base(),
2875       x86_fn_arg( func, 3 ) );
2876
2877    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2878       x86_mov(
2879          func,
2880          get_coef_base(),
2881          x86_fn_arg( func, 4 ) );
2882    }
2883
2884    x86_mov(
2885       func,
2886       get_sampler_base(),
2887       x86_make_disp( get_machine_base(),
2888                      Offset( struct tgsi_exec_machine, Samplers ) ) );
2889
2890
2891    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2892       tgsi_parse_token( &parse );
2893
2894       switch( parse.FullToken.Token.Type ) {
2895       case TGSI_TOKEN_TYPE_DECLARATION:
2896          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2897             emit_declaration(
2898                func,
2899                &parse.FullToken.FullDeclaration );
2900          }
2901          break;
2902
2903       case TGSI_TOKEN_TYPE_INSTRUCTION:
2904          ok = emit_instruction(
2905             func,
2906             &parse.FullToken.FullInstruction );
2907
2908          if (!ok) {
2909             uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2910             debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2911                          opcode,
2912                          tgsi_get_opcode_name(opcode),
2913                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2914                          "vertex shader" : "fragment shader");
2915          }
2916
2917          if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
2918             uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2919
2920             /* XXX: we only handle src/dst aliasing in a few opcodes
2921              * currently.  Need to use an additional temporay to hold
2922              * the result in the cases where the code is too opaque to
2923              * fix.
2924              */
2925             if (opcode != TGSI_OPCODE_MOV) {
2926                debug_printf("Warning: src/dst aliasing in instruction"
2927                             " is not handled:\n");
2928                tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
2929             }
2930          }
2931          break;
2932
2933       case TGSI_TOKEN_TYPE_IMMEDIATE:
2934          /* simply copy the immediate values into the next immediates[] slot */
2935          {
2936             const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2937             uint i;
2938             assert(size <= 4);
2939             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2940             for( i = 0; i < size; i++ ) {
2941                immediates[num_immediates][i] =
2942                   parse.FullToken.FullImmediate.u[i].Float;
2943             }
2944 #if 0
2945             debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2946                    num_immediates,
2947                    immediates[num_immediates][0],
2948                    immediates[num_immediates][1],
2949                    immediates[num_immediates][2],
2950                    immediates[num_immediates][3]);
2951 #endif
2952             num_immediates++;
2953          }
2954          break;
2955
2956       default:
2957          ok = 0;
2958          assert( 0 );
2959       }
2960    }
2961
2962    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2963       if (do_swizzles)
2964          soa_to_aos( func,
2965                      7,         /* aos_output */
2966                      1,         /* machine */
2967                      8,         /* num_outputs */
2968                      9 );       /* output_stride */
2969    }
2970
2971    /* Can't just use EBX, EDI without save/restoring them:
2972     */
2973    x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2974    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2975
2976    emit_ret( func );
2977
2978    tgsi_parse_free( &parse );
2979
2980    return ok;
2981 }
2982
2983 #endif /* PIPE_ARCH_X86 */
2984