Merge branch 'mesa_7_5_branch' into mesa_7_6_branch
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_info.h"
40 #include "tgsi/tgsi_parse.h"
41 #include "tgsi/tgsi_util.h"
42 #include "tgsi_exec.h"
43 #include "tgsi_sse2.h"
44
45 #include "rtasm/rtasm_x86sse.h"
46
/* for 1/sqrt()
 *
 * This costs about 100fps (close to 10%) in gears:
 */
#define HIGH_PRECISION 1

/* NOTE(review): FAST_MATH is not referenced in this part of the file;
 * presumably it gates approximate math paths elsewhere -- confirm.
 */
#define FAST_MATH 1


/* Iterate CHAN over the four vector components (X, Y, Z, W). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Test one bit of the writemask of the instruction's first dst register. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate only over the channels enabled in dst0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Component indices within a vector. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Shorthand for well-known constant/scratch slots in the exec machine. */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
82
83 /**
84 * X86 utility functions.
85 */
86
87 static struct x86_reg
88 make_xmm(
89 unsigned xmm )
90 {
91 return x86_make_reg(
92 file_XMM,
93 (enum x86_reg_name) xmm );
94 }
95
96 /**
97 * X86 register mapping helpers.
98 */
99
100 static struct x86_reg
101 get_const_base( void )
102 {
103 return x86_make_reg(
104 file_REG32,
105 reg_AX );
106 }
107
108 static struct x86_reg
109 get_machine_base( void )
110 {
111 return x86_make_reg(
112 file_REG32,
113 reg_CX );
114 }
115
116 static struct x86_reg
117 get_input_base( void )
118 {
119 return x86_make_disp(
120 get_machine_base(),
121 Offset(struct tgsi_exec_machine, Inputs) );
122 }
123
124 static struct x86_reg
125 get_output_base( void )
126 {
127 return x86_make_disp(
128 get_machine_base(),
129 Offset(struct tgsi_exec_machine, Outputs) );
130 }
131
132 static struct x86_reg
133 get_temp_base( void )
134 {
135 return x86_make_disp(
136 get_machine_base(),
137 Offset(struct tgsi_exec_machine, Temps) );
138 }
139
140 static struct x86_reg
141 get_coef_base( void )
142 {
143 return x86_make_reg(
144 file_REG32,
145 reg_BX );
146 }
147
148 static struct x86_reg
149 get_sampler_base( void )
150 {
151 return x86_make_reg(
152 file_REG32,
153 reg_DI );
154 }
155
156 static struct x86_reg
157 get_immediate_base( void )
158 {
159 return x86_make_reg(
160 file_REG32,
161 reg_DX );
162 }
163
164
165 /**
166 * Data access helpers.
167 */
168
169
170 static struct x86_reg
171 get_immediate(
172 unsigned vec,
173 unsigned chan )
174 {
175 return x86_make_disp(
176 get_immediate_base(),
177 (vec * 4 + chan) * 4 );
178 }
179
180 static struct x86_reg
181 get_const(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_const_base(),
187 (vec * 4 + chan) * 4 );
188 }
189
190 static struct x86_reg
191 get_sampler_ptr(
192 unsigned unit )
193 {
194 return x86_make_disp(
195 get_sampler_base(),
196 unit * sizeof( struct tgsi_sampler * ) );
197 }
198
199 static struct x86_reg
200 get_input(
201 unsigned vec,
202 unsigned chan )
203 {
204 return x86_make_disp(
205 get_input_base(),
206 (vec * 4 + chan) * 16 );
207 }
208
209 static struct x86_reg
210 get_output(
211 unsigned vec,
212 unsigned chan )
213 {
214 return x86_make_disp(
215 get_output_base(),
216 (vec * 4 + chan) * 16 );
217 }
218
219 static struct x86_reg
220 get_temp(
221 unsigned vec,
222 unsigned chan )
223 {
224 return x86_make_disp(
225 get_temp_base(),
226 (vec * 4 + chan) * 16 );
227 }
228
229 static struct x86_reg
230 get_coef(
231 unsigned vec,
232 unsigned chan,
233 unsigned member )
234 {
235 return x86_make_disp(
236 get_coef_base(),
237 ((vec * 3 + member) * 4 + chan) * 4 );
238 }
239
240
/* Emit a function return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
247
248
249 /**
250 * Data fetch helpers.
251 */
252
/**
 * Copy a shader constant to an xmm register, splatted across all four
 * SoA lanes.
 * \param xmm the destination xmm register
 * \param vec the src const buffer index, or the offset from the address
 *            register's value when indirect addressing is used
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero => fetch CONST[ADDR+vec]
 * \param indirectFile file of the indirect register (ADDRESS only)
 * \param indirectIndex index of the indirect register (0 only)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      /* Only ADDR[0]-relative indirection is supported. */
      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* Save the two GP registers we borrow as scratch. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );    /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         /* Stage the fetched scalar into lane i of the TEMP_R0 scratch. */
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load all four gathered floats into the destination xmm. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar, then broadcast it to all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
346
347 static void
348 emit_immediate(
349 struct x86_function *func,
350 unsigned xmm,
351 unsigned vec,
352 unsigned chan )
353 {
354 sse_movss(
355 func,
356 make_xmm( xmm ),
357 get_immediate( vec, chan ) );
358 sse_shufps(
359 func,
360 make_xmm( xmm ),
361 make_xmm( xmm ),
362 SHUF( 0, 0, 0, 0 ) );
363 }
364
365
/**
 * Copy a shader input to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned load: input storage is not guaranteed 16-byte aligned. */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
384
/**
 * Store an xmm register to a shader output.
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned store: output storage is not guaranteed 16-byte aligned. */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
403
/**
 * Copy a shader temporary to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Temps are 16-byte aligned, so an aligned load is safe. */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
422
423 /**
424 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
425 * \param xmm the destination xmm register
426 * \param vec the src input/attribute coefficient index
427 * \param chan src channel to fetch (X, Y, Z or W)
428 * \param member 0=a0, 1=dadx, 2=dady
429 */
430 static void
431 emit_coef(
432 struct x86_function *func,
433 unsigned xmm,
434 unsigned vec,
435 unsigned chan,
436 unsigned member )
437 {
438 sse_movss(
439 func,
440 make_xmm( xmm ),
441 get_coef( vec, chan, member ) );
442 sse_shufps(
443 func,
444 make_xmm( xmm ),
445 make_xmm( xmm ),
446 SHUF( 0, 0, 0, 0 ) );
447 }
448
449 /**
450 * Data store helpers.
451 */
452
453 static void
454 emit_inputs(
455 struct x86_function *func,
456 unsigned xmm,
457 unsigned vec,
458 unsigned chan )
459 {
460 sse_movups(
461 func,
462 get_input( vec, chan ),
463 make_xmm( xmm ) );
464 }
465
466 static void
467 emit_temps(
468 struct x86_function *func,
469 unsigned xmm,
470 unsigned vec,
471 unsigned chan )
472 {
473 sse_movaps(
474 func,
475 get_temp( vec, chan ),
476 make_xmm( xmm ) );
477 }
478
479 static void
480 emit_addrs(
481 struct x86_function *func,
482 unsigned xmm,
483 unsigned vec,
484 unsigned chan )
485 {
486 assert( vec == 0 );
487
488 emit_temps(
489 func,
490 xmm,
491 vec + TGSI_EXEC_TEMP_ADDR,
492 chan );
493 }
494
495 /**
496 * Coefficent fetch helpers.
497 */
498
/* Fetch the a0 (constant term) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/* Fetch the dadx (x derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/* Fetch the dady (y derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
543
544 /**
545 * Function call helpers.
546 */
547
/**
 * Emit a call out to a C function, preserving the caller-saved GP
 * registers (eax/ecx/edx) and a chosen set of xmm registers across
 * the call.
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 *
 * \param xmm_save_mask bitmask of xmm registers to save/restore
 * \param arg array of argument memory locations; a pointer to each is pushed
 * \param nr_args number of entries in 'arg'
 * \param code address of the cdecl function to call
 *
 * NOTE(review): arguments are pushed in forward order; cdecl expects the
 * last argument pushed first.  All visible callers pass nr_args == 1, so
 * this doesn't matter today -- confirm before passing more arguments.
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save caller-saved GP registers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many so we can reserve stack space in one go. */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
646
647 static void
648 emit_func_call_dst_src1(
649 struct x86_function *func,
650 unsigned xmm_save,
651 unsigned xmm_dst,
652 unsigned xmm_src0,
653 void (PIPE_CDECL *code)() )
654 {
655 struct x86_reg store = get_temp( TEMP_R0, 0 );
656 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
657
658 /* Store our input parameters (in xmm regs) to the buffer we use
659 * for passing arguments. We will pass a pointer to this buffer as
660 * the actual function argument.
661 */
662 sse_movaps(
663 func,
664 store,
665 make_xmm( xmm_src0 ) );
666
667 emit_func_call( func,
668 xmm_mask,
669 &store,
670 1,
671 code );
672
673 sse_movaps(
674 func,
675 make_xmm( xmm_dst ),
676 store );
677 }
678
679
680 static void
681 emit_func_call_dst_src2(
682 struct x86_function *func,
683 unsigned xmm_save,
684 unsigned xmm_dst,
685 unsigned xmm_src0,
686 unsigned xmm_src1,
687 void (PIPE_CDECL *code)() )
688 {
689 struct x86_reg store = get_temp( TEMP_R0, 0 );
690 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
691
692 /* Store two inputs to parameter buffer.
693 */
694 sse_movaps(
695 func,
696 store,
697 make_xmm( xmm_src0 ) );
698
699 sse_movaps(
700 func,
701 x86_make_disp( store, 4 * sizeof(float) ),
702 make_xmm( xmm_src1 ) );
703
704
705 /* Emit the call
706 */
707 emit_func_call( func,
708 xmm_mask,
709 &store,
710 1,
711 code );
712
713 /* Retrieve the results:
714 */
715 sse_movaps(
716 func,
717 make_xmm( xmm_dst ),
718 store );
719 }
720
721
722
723
724
725 #if defined(PIPE_ARCH_SSE)
726
/*
 * Fast SSE2 implementation of special math functions.
 */

/* Horner-scheme polynomial evaluation, degree 0 through 5:
 * POLYn(x, c0..cn) = c0 + c1*x + ... + cn*x^n, four lanes at a time.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Speed/accuracy trade-off for the exp2/log2 approximations below. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
740
/**
 * Vectorized approximation of 2^x for four floats.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp x so the biased-exponent construction below cannot overflow
    * or underflow the float exponent field.
    */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart), built by placing the biased
    * exponent (ipart + 127) directly into the exponent bit field.
    */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
777
778
/**
 * Vectorized approximation of log2(x) for four floats.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);   /* float exponent bits */
   __m128i mantmask = _mm_set1_epi32(0x007fffff);  /* float mantissa bits */
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), normalized into [1, 2[ by OR-ing in
    * the exponent bits of 1.0f.
    */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = log2(mant) + exponent */
   return _mm_add_ps(logmant, exp);
}
820
821
/**
 * Vectorized x^y computed as 2^(y * log2(x)).
 * Inherits the accuracy limits of log2f4() and exp2f4().
 */
static INLINE __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}
827
828 #endif /* PIPE_ARCH_SSE */
829
830
831
832 /**
833 * Low-level instruction translators.
834 */
835
836 static void
837 emit_abs(
838 struct x86_function *func,
839 unsigned xmm )
840 {
841 sse_andps(
842 func,
843 make_xmm( xmm ),
844 get_temp(
845 TGSI_EXEC_TEMP_7FFFFFFF_I,
846 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
847 }
848
/* dst += src, four floats at a time. */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
860
861 static void PIPE_CDECL
862 cos4f(
863 float *store )
864 {
865 store[0] = cosf( store[0] );
866 store[1] = cosf( store[1] );
867 store[2] = cosf( store[2] );
868 store[3] = cosf( store[3] );
869 }
870
871 static void
872 emit_cos(
873 struct x86_function *func,
874 unsigned xmm_save,
875 unsigned xmm_dst )
876 {
877 emit_func_call_dst_src1(
878 func,
879 xmm_save,
880 xmm_dst,
881 xmm_dst,
882 cos4f );
883 }
884
/* Out-of-line helper: 2^x for each element of a float[4] in place.
 * Uses the SSE polynomial approximation when available, otherwise the
 * scalar util_fast_exp2() fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
901
902 static void
903 emit_ex2(
904 struct x86_function *func,
905 unsigned xmm_save,
906 unsigned xmm_dst )
907 {
908 emit_func_call_dst_src1(
909 func,
910 xmm_save,
911 xmm_dst,
912 xmm_dst,
913 ex24f );
914 }
915
/* Convert four floats to four ints, truncating toward zero. */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
926
/* Convert four ints to four floats. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
937
938 static void PIPE_CDECL
939 flr4f(
940 float *store )
941 {
942 store[0] = floorf( store[0] );
943 store[1] = floorf( store[1] );
944 store[2] = floorf( store[2] );
945 store[3] = floorf( store[3] );
946 }
947
948 static void
949 emit_flr(
950 struct x86_function *func,
951 unsigned xmm_save,
952 unsigned xmm_dst )
953 {
954 emit_func_call_dst_src1(
955 func,
956 xmm_save,
957 xmm_dst,
958 xmm_dst,
959 flr4f );
960 }
961
962 static void PIPE_CDECL
963 frc4f(
964 float *store )
965 {
966 store[0] -= floorf( store[0] );
967 store[1] -= floorf( store[1] );
968 store[2] -= floorf( store[2] );
969 store[3] -= floorf( store[3] );
970 }
971
972 static void
973 emit_frc(
974 struct x86_function *func,
975 unsigned xmm_save,
976 unsigned xmm_dst )
977 {
978 emit_func_call_dst_src1(
979 func,
980 xmm_save,
981 xmm_dst,
982 xmm_dst,
983 frc4f );
984 }
985
/* Out-of-line helper: log2 of each element of a float[4] in place.
 * Uses the SSE polynomial approximation when available, otherwise the
 * scalar util_fast_log2() fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1002
1003 static void
1004 emit_lg2(
1005 struct x86_function *func,
1006 unsigned xmm_save,
1007 unsigned xmm_dst )
1008 {
1009 emit_func_call_dst_src1(
1010 func,
1011 xmm_save,
1012 xmm_dst,
1013 xmm_dst,
1014 lg24f );
1015 }
1016
/* Register-to-register move of four floats. */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1028
/* dst *= src, four floats at a time. */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1039
1040 static void
1041 emit_neg(
1042 struct x86_function *func,
1043 unsigned xmm )
1044 {
1045 sse_xorps(
1046 func,
1047 make_xmm( xmm ),
1048 get_temp(
1049 TGSI_EXEC_TEMP_80000000_I,
1050 TGSI_EXEC_TEMP_80000000_C ) );
1051 }
1052
/* Out-of-line helper: store[i] = store[i] ^ store[i+4] for i in 0..3.
 * The base values live in store[0..3], the exponents in store[4..7];
 * results overwrite the bases.  Uses the SSE approximation when
 * available, otherwise scalar util_fast_pow().
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1069
1070 static void
1071 emit_pow(
1072 struct x86_function *func,
1073 unsigned xmm_save,
1074 unsigned xmm_dst,
1075 unsigned xmm_src0,
1076 unsigned xmm_src1 )
1077 {
1078 emit_func_call_dst_src2(
1079 func,
1080 xmm_save,
1081 xmm_dst,
1082 xmm_src0,
1083 xmm_src1,
1084 pow4f );
1085 }
1086
/* Approximate reciprocal: dst = ~1/src. */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1102
1103 static void PIPE_CDECL
1104 rnd4f(
1105 float *store )
1106 {
1107 store[0] = floorf( store[0] + 0.5f );
1108 store[1] = floorf( store[1] + 0.5f );
1109 store[2] = floorf( store[2] + 0.5f );
1110 store[3] = floorf( store[3] + 0.5f );
1111 }
1112
1113 static void
1114 emit_rnd(
1115 struct x86_function *func,
1116 unsigned xmm_save,
1117 unsigned xmm_dst )
1118 {
1119 emit_func_call_dst_src1(
1120 func,
1121 xmm_save,
1122 xmm_dst,
1123 xmm_dst,
1124 rnd4f );
1125 }
1126
/**
 * Emit code computing 1/sqrt(src) into dst.
 *
 * NOTE(review): in the HIGH_PRECISION path, xmm_src and xmm2/xmm3 are
 * clobbered as scratch -- callers must not expect them preserved.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* xmm2/xmm3 are hardwired scratch; refuse to overlap them. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );    /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );   /* tmp1 = r ~= rsqrt(a) */
      sse_mulps( func, src, tmp1 );     /* src = a * r */
      sse_mulps( func, dst, tmp1 );     /* dst = 0.5 * r */
      sse_mulps( func, src, tmp1 );     /* src = a * r * r */
      sse_subps( func, tmp0, src );     /* tmp0 = 3 - a*r*r */
      sse_mulps( func, dst, tmp0 );     /* dst = 0.5 * r * (3 - a*r*r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1172
1173 static void
1174 emit_setsign(
1175 struct x86_function *func,
1176 unsigned xmm )
1177 {
1178 sse_orps(
1179 func,
1180 make_xmm( xmm ),
1181 get_temp(
1182 TGSI_EXEC_TEMP_80000000_I,
1183 TGSI_EXEC_TEMP_80000000_C ) );
1184 }
1185
1186 static void PIPE_CDECL
1187 sgn4f(
1188 float *store )
1189 {
1190 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1191 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1192 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1193 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1194 }
1195
1196 static void
1197 emit_sgn(
1198 struct x86_function *func,
1199 unsigned xmm_save,
1200 unsigned xmm_dst )
1201 {
1202 emit_func_call_dst_src1(
1203 func,
1204 xmm_save,
1205 xmm_dst,
1206 xmm_dst,
1207 sgn4f );
1208 }
1209
1210 static void PIPE_CDECL
1211 sin4f(
1212 float *store )
1213 {
1214 store[0] = sinf( store[0] );
1215 store[1] = sinf( store[1] );
1216 store[2] = sinf( store[2] );
1217 store[3] = sinf( store[3] );
1218 }
1219
1220 static void
1221 emit_sin (struct x86_function *func,
1222 unsigned xmm_save,
1223 unsigned xmm_dst)
1224 {
1225 emit_func_call_dst_src1(
1226 func,
1227 xmm_save,
1228 xmm_dst,
1229 xmm_dst,
1230 sin4f );
1231 }
1232
/* dst -= src, four floats at a time. */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1244
1245
1246
1247
1248
1249
1250
1251 /**
1252 * Register fetch.
1253 */
1254
/**
 * Emit code fetching one channel of a source operand into an xmm
 * register, applying the operand's extended swizzle and sign mode.
 *
 * \param xmm destination xmm register
 * \param reg full source register description
 * \param chan_index which dst channel is being computed (selects swizzle)
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Plain component select: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Constant 0.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Constant 1.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode on top of the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1347
/* Fetch src operand INDEX's channel CHAN into xmm register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1350
1351 /**
1352 * Register store.
1353 */
1354
/**
 * Emit code storing an xmm register to one channel of the instruction's
 * destination operand.
 *
 * NOTE(review): saturation is not implemented here -- TGSI_SAT_ZERO_ONE
 * is silently accepted (the assert is commented out) and
 * TGSI_SAT_MINUS_PLUS_ONE asserts.  Results of saturating instructions
 * are therefore unclamped.
 *
 * \param xmm source xmm register
 * \param reg full destination register description
 * \param inst the instruction (consulted for the saturate mode)
 * \param chan_index dest channel to store (X, Y, Z or W)
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1405
/* Store xmm register XMM to dst operand INDEX's channel CHAN. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1408
1409
/**
 * Out-of-line texture sampling callback, invoked from generated code.
 * 'store' is the shared parameter buffer: on entry it holds the s/t/p
 * texcoords for the quad (store[0..3], store[4..7], store[8..11]) and
 * the lod bias in store[12..15]; on exit it holds the sampled RGBA,
 * one channel per 4-float group (SoA).
 *
 * NOTE(review): the lod bias is currently ignored -- 0.0f is passed to
 * get_samples() instead of store[12].
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* Copy the sampled quad back into the shared buffer. */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1453
1454 /**
1455 * High-level instruction translators.
1456 */
1457
/**
 * Emit code for a TEX/TXB/TXL/TXP instruction.
 *
 * Texcoords (and a lod bias value) are staged into the TEMP_R0 scratch
 * area, fetch_texel() is invoked via emit_func_call, and the results
 * are copied from TEMP_R0 into the enabled destination channels.
 *
 * \param lodbias    fetch src[0].w as a per-quad bias (TXB/TXL path);
 *                   otherwise 0.0 is stored.
 * \param projected  multiply the texcoords by 1/src[0].w (TXP path).
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* Number of texcoord components to fetch for this target. */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   if (lodbias) {
      /* xmm3 = src[0].w (bias) */
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      /* xmm3 = 0.0 */
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      /* xmm3 = 1 / src[0].w */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         /* texcoord *= 1/w */
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1556
1557
/**
 * Emit code for the KIL (conditional kill) instruction.
 *
 * Each distinct source component (after swizzling; ZERO/ONE swizzles
 * are pre-marked tested since they can never be negative) is compared
 * against 0.0 with cc_LessThan.  The per-quad comparison masks are
 * collected via movmskps into EAX (ORed through EDX for subsequent
 * components) and finally ORed into the machine's KILMASK temp.
 * EAX/EDX are saved and restored around the sequence.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* Preserve EAX/EDX, which are used as scratch below. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* dataXMM = all-ones per element where value < 0.0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         /* accumulate further component masks into EAX */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* OR the accumulated mask into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1644
1645
/**
 * Emit code for the KILP (predicated kill) instruction.
 * Not implemented: generates no code (the caller returns 0 so the
 * shader falls back to the interpreter).
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1652
1653
1654 static void
1655 emit_setcc(
1656 struct x86_function *func,
1657 struct tgsi_full_instruction *inst,
1658 enum sse_cc cc )
1659 {
1660 unsigned chan_index;
1661
1662 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1663 FETCH( func, *inst, 0, 0, chan_index );
1664 FETCH( func, *inst, 1, 1, chan_index );
1665 sse_cmpps(
1666 func,
1667 make_xmm( 0 ),
1668 make_xmm( 1 ),
1669 cc );
1670 sse_andps(
1671 func,
1672 make_xmm( 0 ),
1673 get_temp(
1674 TEMP_ONE_I,
1675 TEMP_ONE_C ) );
1676 STORE( func, *inst, 0, 0, chan_index );
1677 }
1678 }
1679
1680 static void
1681 emit_cmp(
1682 struct x86_function *func,
1683 struct tgsi_full_instruction *inst )
1684 {
1685 unsigned chan_index;
1686
1687 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1688 FETCH( func, *inst, 0, 0, chan_index );
1689 FETCH( func, *inst, 1, 1, chan_index );
1690 FETCH( func, *inst, 2, 2, chan_index );
1691 sse_cmpps(
1692 func,
1693 make_xmm( 0 ),
1694 get_temp(
1695 TGSI_EXEC_TEMP_00000000_I,
1696 TGSI_EXEC_TEMP_00000000_C ),
1697 cc_LessThan );
1698 sse_andps(
1699 func,
1700 make_xmm( 1 ),
1701 make_xmm( 0 ) );
1702 sse_andnps(
1703 func,
1704 make_xmm( 0 ),
1705 make_xmm( 2 ) );
1706 sse_orps(
1707 func,
1708 make_xmm( 0 ),
1709 make_xmm( 1 ) );
1710 STORE( func, *inst, 0, 0, chan_index );
1711 }
1712 }
1713
1714
1715 /**
1716 * Check if inst src/dest regs use indirect addressing into temporary
1717 * register file.
1718 */
1719 static boolean
1720 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1721 {
1722 uint i;
1723 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1724 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1725 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1726 reg->SrcRegister.Indirect)
1727 return TRUE;
1728 }
1729 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1730 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1731 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1732 reg->DstRegister.Indirect)
1733 return TRUE;
1734 }
1735 return FALSE;
1736 }
1737
1738
1739 static int
1740 emit_instruction(
1741 struct x86_function *func,
1742 struct tgsi_full_instruction *inst )
1743 {
1744 unsigned chan_index;
1745
1746 /* we can't handle indirect addressing into temp register file yet */
1747 if (indirect_temp_reference(inst))
1748 return FALSE;
1749
1750 /* we don't handle saturation/clamping yet */
1751 if (inst->Instruction.Saturate != TGSI_SAT_NONE)
1752 return FALSE;
1753
1754 /* need to use extra temps to fix SOA dependencies : */
1755 if (tgsi_check_soa_dependencies(inst))
1756 return FALSE;
1757
1758 switch (inst->Instruction.Opcode) {
1759 case TGSI_OPCODE_ARL:
1760 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1761 FETCH( func, *inst, 0, 0, chan_index );
1762 emit_flr(func, 0, 0);
1763 emit_f2it( func, 0 );
1764 STORE( func, *inst, 0, 0, chan_index );
1765 }
1766 break;
1767
1768 case TGSI_OPCODE_MOV:
1769 case TGSI_OPCODE_SWZ:
1770 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1771 FETCH( func, *inst, 0, 0, chan_index );
1772 STORE( func, *inst, 0, 0, chan_index );
1773 }
1774 break;
1775
1776 case TGSI_OPCODE_LIT:
1777 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1778 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1779 emit_tempf(
1780 func,
1781 0,
1782 TEMP_ONE_I,
1783 TEMP_ONE_C);
1784 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1785 STORE( func, *inst, 0, 0, CHAN_X );
1786 }
1787 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1788 STORE( func, *inst, 0, 0, CHAN_W );
1789 }
1790 }
1791 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1792 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1793 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1794 FETCH( func, *inst, 0, 0, CHAN_X );
1795 sse_maxps(
1796 func,
1797 make_xmm( 0 ),
1798 get_temp(
1799 TGSI_EXEC_TEMP_00000000_I,
1800 TGSI_EXEC_TEMP_00000000_C ) );
1801 STORE( func, *inst, 0, 0, CHAN_Y );
1802 }
1803 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1804 /* XMM[1] = SrcReg[0].yyyy */
1805 FETCH( func, *inst, 1, 0, CHAN_Y );
1806 /* XMM[1] = max(XMM[1], 0) */
1807 sse_maxps(
1808 func,
1809 make_xmm( 1 ),
1810 get_temp(
1811 TGSI_EXEC_TEMP_00000000_I,
1812 TGSI_EXEC_TEMP_00000000_C ) );
1813 /* XMM[2] = SrcReg[0].wwww */
1814 FETCH( func, *inst, 2, 0, CHAN_W );
1815 /* XMM[2] = min(XMM[2], 128.0) */
1816 sse_minps(
1817 func,
1818 make_xmm( 2 ),
1819 get_temp(
1820 TGSI_EXEC_TEMP_128_I,
1821 TGSI_EXEC_TEMP_128_C ) );
1822 /* XMM[2] = max(XMM[2], -128.0) */
1823 sse_maxps(
1824 func,
1825 make_xmm( 2 ),
1826 get_temp(
1827 TGSI_EXEC_TEMP_MINUS_128_I,
1828 TGSI_EXEC_TEMP_MINUS_128_C ) );
1829 emit_pow( func, 3, 1, 1, 2 );
1830 FETCH( func, *inst, 0, 0, CHAN_X );
1831 sse_xorps(
1832 func,
1833 make_xmm( 2 ),
1834 make_xmm( 2 ) );
1835 sse_cmpps(
1836 func,
1837 make_xmm( 2 ),
1838 make_xmm( 0 ),
1839 cc_LessThan );
1840 sse_andps(
1841 func,
1842 make_xmm( 2 ),
1843 make_xmm( 1 ) );
1844 STORE( func, *inst, 2, 0, CHAN_Z );
1845 }
1846 }
1847 break;
1848
1849 case TGSI_OPCODE_RCP:
1850 /* TGSI_OPCODE_RECIP */
1851 FETCH( func, *inst, 0, 0, CHAN_X );
1852 emit_rcp( func, 0, 0 );
1853 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1854 STORE( func, *inst, 0, 0, chan_index );
1855 }
1856 break;
1857
1858 case TGSI_OPCODE_RSQ:
1859 /* TGSI_OPCODE_RECIPSQRT */
1860 FETCH( func, *inst, 0, 0, CHAN_X );
1861 emit_abs( func, 0 );
1862 emit_rsqrt( func, 1, 0 );
1863 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1864 STORE( func, *inst, 1, 0, chan_index );
1865 }
1866 break;
1867
1868 case TGSI_OPCODE_EXP:
1869 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1870 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1871 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1872 FETCH( func, *inst, 0, 0, CHAN_X );
1873 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1874 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1875 emit_MOV( func, 1, 0 );
1876 emit_flr( func, 2, 1 );
1877 /* dst.x = ex2(floor(src.x)) */
1878 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1879 emit_MOV( func, 2, 1 );
1880 emit_ex2( func, 3, 2 );
1881 STORE( func, *inst, 2, 0, CHAN_X );
1882 }
1883 /* dst.y = src.x - floor(src.x) */
1884 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1885 emit_MOV( func, 2, 0 );
1886 emit_sub( func, 2, 1 );
1887 STORE( func, *inst, 2, 0, CHAN_Y );
1888 }
1889 }
1890 /* dst.z = ex2(src.x) */
1891 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1892 emit_ex2( func, 3, 0 );
1893 STORE( func, *inst, 0, 0, CHAN_Z );
1894 }
1895 }
1896 /* dst.w = 1.0 */
1897 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1898 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1899 STORE( func, *inst, 0, 0, CHAN_W );
1900 }
1901 break;
1902
1903 case TGSI_OPCODE_LOG:
1904 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1905 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1906 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1907 FETCH( func, *inst, 0, 0, CHAN_X );
1908 emit_abs( func, 0 );
1909 emit_MOV( func, 1, 0 );
1910 emit_lg2( func, 2, 1 );
1911 /* dst.z = lg2(abs(src.x)) */
1912 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1913 STORE( func, *inst, 1, 0, CHAN_Z );
1914 }
1915 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1916 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1917 emit_flr( func, 2, 1 );
1918 /* dst.x = floor(lg2(abs(src.x))) */
1919 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1920 STORE( func, *inst, 1, 0, CHAN_X );
1921 }
1922 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1923 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1924 emit_ex2( func, 2, 1 );
1925 emit_rcp( func, 1, 1 );
1926 emit_mul( func, 0, 1 );
1927 STORE( func, *inst, 0, 0, CHAN_Y );
1928 }
1929 }
1930 }
1931 /* dst.w = 1.0 */
1932 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1933 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1934 STORE( func, *inst, 0, 0, CHAN_W );
1935 }
1936 break;
1937
1938 case TGSI_OPCODE_MUL:
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1940 FETCH( func, *inst, 0, 0, chan_index );
1941 FETCH( func, *inst, 1, 1, chan_index );
1942 emit_mul( func, 0, 1 );
1943 STORE( func, *inst, 0, 0, chan_index );
1944 }
1945 break;
1946
1947 case TGSI_OPCODE_ADD:
1948 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1949 FETCH( func, *inst, 0, 0, chan_index );
1950 FETCH( func, *inst, 1, 1, chan_index );
1951 emit_add( func, 0, 1 );
1952 STORE( func, *inst, 0, 0, chan_index );
1953 }
1954 break;
1955
1956 case TGSI_OPCODE_DP3:
1957 /* TGSI_OPCODE_DOT3 */
1958 FETCH( func, *inst, 0, 0, CHAN_X );
1959 FETCH( func, *inst, 1, 1, CHAN_X );
1960 emit_mul( func, 0, 1 );
1961 FETCH( func, *inst, 1, 0, CHAN_Y );
1962 FETCH( func, *inst, 2, 1, CHAN_Y );
1963 emit_mul( func, 1, 2 );
1964 emit_add( func, 0, 1 );
1965 FETCH( func, *inst, 1, 0, CHAN_Z );
1966 FETCH( func, *inst, 2, 1, CHAN_Z );
1967 emit_mul( func, 1, 2 );
1968 emit_add( func, 0, 1 );
1969 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1970 STORE( func, *inst, 0, 0, chan_index );
1971 }
1972 break;
1973
1974 case TGSI_OPCODE_DP4:
1975 /* TGSI_OPCODE_DOT4 */
1976 FETCH( func, *inst, 0, 0, CHAN_X );
1977 FETCH( func, *inst, 1, 1, CHAN_X );
1978 emit_mul( func, 0, 1 );
1979 FETCH( func, *inst, 1, 0, CHAN_Y );
1980 FETCH( func, *inst, 2, 1, CHAN_Y );
1981 emit_mul( func, 1, 2 );
1982 emit_add( func, 0, 1 );
1983 FETCH( func, *inst, 1, 0, CHAN_Z );
1984 FETCH( func, *inst, 2, 1, CHAN_Z );
1985 emit_mul(func, 1, 2 );
1986 emit_add(func, 0, 1 );
1987 FETCH( func, *inst, 1, 0, CHAN_W );
1988 FETCH( func, *inst, 2, 1, CHAN_W );
1989 emit_mul( func, 1, 2 );
1990 emit_add( func, 0, 1 );
1991 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1992 STORE( func, *inst, 0, 0, chan_index );
1993 }
1994 break;
1995
1996 case TGSI_OPCODE_DST:
1997 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1998 emit_tempf(
1999 func,
2000 0,
2001 TEMP_ONE_I,
2002 TEMP_ONE_C );
2003 STORE( func, *inst, 0, 0, CHAN_X );
2004 }
2005 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2006 FETCH( func, *inst, 0, 0, CHAN_Y );
2007 FETCH( func, *inst, 1, 1, CHAN_Y );
2008 emit_mul( func, 0, 1 );
2009 STORE( func, *inst, 0, 0, CHAN_Y );
2010 }
2011 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2012 FETCH( func, *inst, 0, 0, CHAN_Z );
2013 STORE( func, *inst, 0, 0, CHAN_Z );
2014 }
2015 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2016 FETCH( func, *inst, 0, 1, CHAN_W );
2017 STORE( func, *inst, 0, 0, CHAN_W );
2018 }
2019 break;
2020
2021 case TGSI_OPCODE_MIN:
2022 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2023 FETCH( func, *inst, 0, 0, chan_index );
2024 FETCH( func, *inst, 1, 1, chan_index );
2025 sse_minps(
2026 func,
2027 make_xmm( 0 ),
2028 make_xmm( 1 ) );
2029 STORE( func, *inst, 0, 0, chan_index );
2030 }
2031 break;
2032
2033 case TGSI_OPCODE_MAX:
2034 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2035 FETCH( func, *inst, 0, 0, chan_index );
2036 FETCH( func, *inst, 1, 1, chan_index );
2037 sse_maxps(
2038 func,
2039 make_xmm( 0 ),
2040 make_xmm( 1 ) );
2041 STORE( func, *inst, 0, 0, chan_index );
2042 }
2043 break;
2044
2045 case TGSI_OPCODE_SLT:
2046 /* TGSI_OPCODE_SETLT */
2047 emit_setcc( func, inst, cc_LessThan );
2048 break;
2049
2050 case TGSI_OPCODE_SGE:
2051 /* TGSI_OPCODE_SETGE */
2052 emit_setcc( func, inst, cc_NotLessThan );
2053 break;
2054
2055 case TGSI_OPCODE_MAD:
2056 /* TGSI_OPCODE_MADD */
2057 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2058 FETCH( func, *inst, 0, 0, chan_index );
2059 FETCH( func, *inst, 1, 1, chan_index );
2060 FETCH( func, *inst, 2, 2, chan_index );
2061 emit_mul( func, 0, 1 );
2062 emit_add( func, 0, 2 );
2063 STORE( func, *inst, 0, 0, chan_index );
2064 }
2065 break;
2066
2067 case TGSI_OPCODE_SUB:
2068 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2069 FETCH( func, *inst, 0, 0, chan_index );
2070 FETCH( func, *inst, 1, 1, chan_index );
2071 emit_sub( func, 0, 1 );
2072 STORE( func, *inst, 0, 0, chan_index );
2073 }
2074 break;
2075
2076 case TGSI_OPCODE_LRP:
2077 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2078 FETCH( func, *inst, 0, 0, chan_index );
2079 FETCH( func, *inst, 1, 1, chan_index );
2080 FETCH( func, *inst, 2, 2, chan_index );
2081 emit_sub( func, 1, 2 );
2082 emit_mul( func, 0, 1 );
2083 emit_add( func, 0, 2 );
2084 STORE( func, *inst, 0, 0, chan_index );
2085 }
2086 break;
2087
2088 case TGSI_OPCODE_CND:
2089 return 0;
2090 break;
2091
2092 case TGSI_OPCODE_DP2A:
2093 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2094 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2095 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2096 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2097 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2098 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2099 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2100 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2101 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2102 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2103 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2104 }
2105 break;
2106
2107 case TGSI_OPCODE_FRC:
2108 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2109 FETCH( func, *inst, 0, 0, chan_index );
2110 emit_frc( func, 0, 0 );
2111 STORE( func, *inst, 0, 0, chan_index );
2112 }
2113 break;
2114
2115 case TGSI_OPCODE_CLAMP:
2116 return 0;
2117 break;
2118
2119 case TGSI_OPCODE_FLR:
2120 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2121 FETCH( func, *inst, 0, 0, chan_index );
2122 emit_flr( func, 0, 0 );
2123 STORE( func, *inst, 0, 0, chan_index );
2124 }
2125 break;
2126
2127 case TGSI_OPCODE_ROUND:
2128 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2129 FETCH( func, *inst, 0, 0, chan_index );
2130 emit_rnd( func, 0, 0 );
2131 STORE( func, *inst, 0, 0, chan_index );
2132 }
2133 break;
2134
2135 case TGSI_OPCODE_EX2:
2136 FETCH( func, *inst, 0, 0, CHAN_X );
2137 emit_ex2( func, 0, 0 );
2138 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2139 STORE( func, *inst, 0, 0, chan_index );
2140 }
2141 break;
2142
2143 case TGSI_OPCODE_LG2:
2144 FETCH( func, *inst, 0, 0, CHAN_X );
2145 emit_lg2( func, 0, 0 );
2146 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2147 STORE( func, *inst, 0, 0, chan_index );
2148 }
2149 break;
2150
2151 case TGSI_OPCODE_POW:
2152 FETCH( func, *inst, 0, 0, CHAN_X );
2153 FETCH( func, *inst, 1, 1, CHAN_X );
2154 emit_pow( func, 0, 0, 0, 1 );
2155 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2156 STORE( func, *inst, 0, 0, chan_index );
2157 }
2158 break;
2159
2160 case TGSI_OPCODE_XPD:
2161 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2162 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2163 FETCH( func, *inst, 1, 1, CHAN_Z );
2164 FETCH( func, *inst, 3, 0, CHAN_Z );
2165 }
2166 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2167 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2168 FETCH( func, *inst, 0, 0, CHAN_Y );
2169 FETCH( func, *inst, 4, 1, CHAN_Y );
2170 }
2171 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2172 emit_MOV( func, 2, 0 );
2173 emit_mul( func, 2, 1 );
2174 emit_MOV( func, 5, 3 );
2175 emit_mul( func, 5, 4 );
2176 emit_sub( func, 2, 5 );
2177 STORE( func, *inst, 2, 0, CHAN_X );
2178 }
2179 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2180 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2181 FETCH( func, *inst, 2, 1, CHAN_X );
2182 FETCH( func, *inst, 5, 0, CHAN_X );
2183 }
2184 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2185 emit_mul( func, 3, 2 );
2186 emit_mul( func, 1, 5 );
2187 emit_sub( func, 3, 1 );
2188 STORE( func, *inst, 3, 0, CHAN_Y );
2189 }
2190 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2191 emit_mul( func, 5, 4 );
2192 emit_mul( func, 0, 2 );
2193 emit_sub( func, 5, 0 );
2194 STORE( func, *inst, 5, 0, CHAN_Z );
2195 }
2196 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2197 emit_tempf(
2198 func,
2199 0,
2200 TEMP_ONE_I,
2201 TEMP_ONE_C );
2202 STORE( func, *inst, 0, 0, CHAN_W );
2203 }
2204 break;
2205
2206 case TGSI_OPCODE_ABS:
2207 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2208 FETCH( func, *inst, 0, 0, chan_index );
2209 emit_abs( func, 0) ;
2210
2211 STORE( func, *inst, 0, 0, chan_index );
2212 }
2213 break;
2214
2215 case TGSI_OPCODE_RCC:
2216 return 0;
2217 break;
2218
2219 case TGSI_OPCODE_DPH:
2220 FETCH( func, *inst, 0, 0, CHAN_X );
2221 FETCH( func, *inst, 1, 1, CHAN_X );
2222 emit_mul( func, 0, 1 );
2223 FETCH( func, *inst, 1, 0, CHAN_Y );
2224 FETCH( func, *inst, 2, 1, CHAN_Y );
2225 emit_mul( func, 1, 2 );
2226 emit_add( func, 0, 1 );
2227 FETCH( func, *inst, 1, 0, CHAN_Z );
2228 FETCH( func, *inst, 2, 1, CHAN_Z );
2229 emit_mul( func, 1, 2 );
2230 emit_add( func, 0, 1 );
2231 FETCH( func, *inst, 1, 1, CHAN_W );
2232 emit_add( func, 0, 1 );
2233 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2234 STORE( func, *inst, 0, 0, chan_index );
2235 }
2236 break;
2237
2238 case TGSI_OPCODE_COS:
2239 FETCH( func, *inst, 0, 0, CHAN_X );
2240 emit_cos( func, 0, 0 );
2241 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2242 STORE( func, *inst, 0, 0, chan_index );
2243 }
2244 break;
2245
2246 case TGSI_OPCODE_DDX:
2247 return 0;
2248 break;
2249
2250 case TGSI_OPCODE_DDY:
2251 return 0;
2252 break;
2253
2254 case TGSI_OPCODE_KILP:
2255 /* predicated kill */
2256 emit_kilp( func );
2257 return 0; /* XXX fix me */
2258 break;
2259
2260 case TGSI_OPCODE_KIL:
2261 /* conditional kill */
2262 emit_kil( func, &inst->FullSrcRegisters[0] );
2263 break;
2264
2265 case TGSI_OPCODE_PK2H:
2266 return 0;
2267 break;
2268
2269 case TGSI_OPCODE_PK2US:
2270 return 0;
2271 break;
2272
2273 case TGSI_OPCODE_PK4B:
2274 return 0;
2275 break;
2276
2277 case TGSI_OPCODE_PK4UB:
2278 return 0;
2279 break;
2280
2281 case TGSI_OPCODE_RFL:
2282 return 0;
2283 break;
2284
2285 case TGSI_OPCODE_SEQ:
2286 return 0;
2287 break;
2288
2289 case TGSI_OPCODE_SFL:
2290 return 0;
2291 break;
2292
2293 case TGSI_OPCODE_SGT:
2294 return 0;
2295 break;
2296
2297 case TGSI_OPCODE_SIN:
2298 FETCH( func, *inst, 0, 0, CHAN_X );
2299 emit_sin( func, 0, 0 );
2300 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2301 STORE( func, *inst, 0, 0, chan_index );
2302 }
2303 break;
2304
2305 case TGSI_OPCODE_SLE:
2306 return 0;
2307 break;
2308
2309 case TGSI_OPCODE_SNE:
2310 return 0;
2311 break;
2312
2313 case TGSI_OPCODE_STR:
2314 return 0;
2315 break;
2316
2317 case TGSI_OPCODE_TEX:
2318 emit_tex( func, inst, FALSE, FALSE );
2319 break;
2320
2321 case TGSI_OPCODE_TXD:
2322 return 0;
2323 break;
2324
2325 case TGSI_OPCODE_UP2H:
2326 return 0;
2327 break;
2328
2329 case TGSI_OPCODE_UP2US:
2330 return 0;
2331 break;
2332
2333 case TGSI_OPCODE_UP4B:
2334 return 0;
2335 break;
2336
2337 case TGSI_OPCODE_UP4UB:
2338 return 0;
2339 break;
2340
2341 case TGSI_OPCODE_X2D:
2342 return 0;
2343 break;
2344
2345 case TGSI_OPCODE_ARA:
2346 return 0;
2347 break;
2348
2349 case TGSI_OPCODE_ARR:
2350 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2351 FETCH( func, *inst, 0, 0, chan_index );
2352 emit_rnd( func, 0, 0 );
2353 emit_f2it( func, 0 );
2354 STORE( func, *inst, 0, 0, chan_index );
2355 }
2356 break;
2357
2358 case TGSI_OPCODE_BRA:
2359 return 0;
2360 break;
2361
2362 case TGSI_OPCODE_CAL:
2363 return 0;
2364 break;
2365
2366 case TGSI_OPCODE_RET:
2367 emit_ret( func );
2368 break;
2369
2370 case TGSI_OPCODE_END:
2371 break;
2372
2373 case TGSI_OPCODE_SSG:
2374 /* TGSI_OPCODE_SGN */
2375 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2376 FETCH( func, *inst, 0, 0, chan_index );
2377 emit_sgn( func, 0, 0 );
2378 STORE( func, *inst, 0, 0, chan_index );
2379 }
2380 break;
2381
2382 case TGSI_OPCODE_CMP:
2383 emit_cmp (func, inst);
2384 break;
2385
2386 case TGSI_OPCODE_SCS:
2387 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2388 FETCH( func, *inst, 0, 0, CHAN_X );
2389 emit_cos( func, 0, 0 );
2390 STORE( func, *inst, 0, 0, CHAN_X );
2391 }
2392 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2393 FETCH( func, *inst, 0, 0, CHAN_X );
2394 emit_sin( func, 0, 0 );
2395 STORE( func, *inst, 0, 0, CHAN_Y );
2396 }
2397 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2398 emit_tempf(
2399 func,
2400 0,
2401 TGSI_EXEC_TEMP_00000000_I,
2402 TGSI_EXEC_TEMP_00000000_C );
2403 STORE( func, *inst, 0, 0, CHAN_Z );
2404 }
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2406 emit_tempf(
2407 func,
2408 0,
2409 TEMP_ONE_I,
2410 TEMP_ONE_C );
2411 STORE( func, *inst, 0, 0, CHAN_W );
2412 }
2413 break;
2414
2415 case TGSI_OPCODE_TXB:
2416 emit_tex( func, inst, TRUE, FALSE );
2417 break;
2418
2419 case TGSI_OPCODE_NRM:
2420 /* fall-through */
2421 case TGSI_OPCODE_NRM4:
2422 /* 3 or 4-component normalization */
2423 {
2424 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2425
2426 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2427 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2428 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2429 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2430
2431 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2432
2433 /* xmm4 = src.x */
2434 /* xmm0 = src.x * src.x */
2435 FETCH(func, *inst, 0, 0, CHAN_X);
2436 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2437 emit_MOV(func, 4, 0);
2438 }
2439 emit_mul(func, 0, 0);
2440
2441 /* xmm5 = src.y */
2442 /* xmm0 = xmm0 + src.y * src.y */
2443 FETCH(func, *inst, 1, 0, CHAN_Y);
2444 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2445 emit_MOV(func, 5, 1);
2446 }
2447 emit_mul(func, 1, 1);
2448 emit_add(func, 0, 1);
2449
2450 /* xmm6 = src.z */
2451 /* xmm0 = xmm0 + src.z * src.z */
2452 FETCH(func, *inst, 1, 0, CHAN_Z);
2453 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2454 emit_MOV(func, 6, 1);
2455 }
2456 emit_mul(func, 1, 1);
2457 emit_add(func, 0, 1);
2458
2459 if (dims == 4) {
2460 /* xmm7 = src.w */
2461 /* xmm0 = xmm0 + src.w * src.w */
2462 FETCH(func, *inst, 1, 0, CHAN_W);
2463 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2464 emit_MOV(func, 7, 1);
2465 }
2466 emit_mul(func, 1, 1);
2467 emit_add(func, 0, 1);
2468 }
2469
2470 /* xmm1 = 1 / sqrt(xmm0) */
2471 emit_rsqrt(func, 1, 0);
2472
2473 /* dst.x = xmm1 * src.x */
2474 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2475 emit_mul(func, 4, 1);
2476 STORE(func, *inst, 4, 0, CHAN_X);
2477 }
2478
2479 /* dst.y = xmm1 * src.y */
2480 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2481 emit_mul(func, 5, 1);
2482 STORE(func, *inst, 5, 0, CHAN_Y);
2483 }
2484
2485 /* dst.z = xmm1 * src.z */
2486 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2487 emit_mul(func, 6, 1);
2488 STORE(func, *inst, 6, 0, CHAN_Z);
2489 }
2490
2491 /* dst.w = xmm1 * src.w */
2492 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2493 emit_mul(func, 7, 1);
2494 STORE(func, *inst, 7, 0, CHAN_W);
2495 }
2496 }
2497
2498 /* dst0.w = 1.0 */
2499 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2500 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2501 STORE(func, *inst, 0, 0, CHAN_W);
2502 }
2503 }
2504 break;
2505
2506 case TGSI_OPCODE_DIV:
2507 return 0;
2508 break;
2509
2510 case TGSI_OPCODE_DP2:
2511 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2512 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2513 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2514 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2515 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2516 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2517 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2518 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2519 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2520 }
2521 break;
2522
2523 case TGSI_OPCODE_TXL:
2524 emit_tex( func, inst, TRUE, FALSE );
2525 break;
2526
2527 case TGSI_OPCODE_TXP:
2528 emit_tex( func, inst, FALSE, TRUE );
2529 break;
2530
2531 case TGSI_OPCODE_BRK:
2532 return 0;
2533 break;
2534
2535 case TGSI_OPCODE_IF:
2536 return 0;
2537 break;
2538
2539 case TGSI_OPCODE_BGNFOR:
2540 return 0;
2541 break;
2542
2543 case TGSI_OPCODE_REP:
2544 return 0;
2545 break;
2546
2547 case TGSI_OPCODE_ELSE:
2548 return 0;
2549 break;
2550
2551 case TGSI_OPCODE_ENDIF:
2552 return 0;
2553 break;
2554
2555 case TGSI_OPCODE_ENDFOR:
2556 return 0;
2557 break;
2558
2559 case TGSI_OPCODE_ENDREP:
2560 return 0;
2561 break;
2562
2563 case TGSI_OPCODE_PUSHA:
2564 return 0;
2565 break;
2566
2567 case TGSI_OPCODE_POPA:
2568 return 0;
2569 break;
2570
2571 case TGSI_OPCODE_CEIL:
2572 return 0;
2573 break;
2574
2575 case TGSI_OPCODE_I2F:
2576 return 0;
2577 break;
2578
2579 case TGSI_OPCODE_NOT:
2580 return 0;
2581 break;
2582
2583 case TGSI_OPCODE_TRUNC:
2584 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2585 FETCH( func, *inst, 0, 0, chan_index );
2586 emit_f2it( func, 0 );
2587 emit_i2f( func, 0 );
2588 STORE( func, *inst, 0, 0, chan_index );
2589 }
2590 break;
2591
2592 case TGSI_OPCODE_SHL:
2593 return 0;
2594 break;
2595
2596 case TGSI_OPCODE_SHR:
2597 return 0;
2598 break;
2599
2600 case TGSI_OPCODE_AND:
2601 return 0;
2602 break;
2603
2604 case TGSI_OPCODE_OR:
2605 return 0;
2606 break;
2607
2608 case TGSI_OPCODE_MOD:
2609 return 0;
2610 break;
2611
2612 case TGSI_OPCODE_XOR:
2613 return 0;
2614 break;
2615
2616 case TGSI_OPCODE_SAD:
2617 return 0;
2618 break;
2619
2620 case TGSI_OPCODE_TXF:
2621 return 0;
2622 break;
2623
2624 case TGSI_OPCODE_TXQ:
2625 return 0;
2626 break;
2627
2628 case TGSI_OPCODE_CONT:
2629 return 0;
2630 break;
2631
2632 case TGSI_OPCODE_EMIT:
2633 return 0;
2634 break;
2635
2636 case TGSI_OPCODE_ENDPRIM:
2637 return 0;
2638 break;
2639
2640 default:
2641 return 0;
2642 }
2643
2644 return 1;
2645 }
2646
/**
 * Emit code to compute interpolated values for one TGSI input
 * declaration (declarations for other register files are ignored).
 *
 * For each declared register/channel, depending on the interpolation
 * mode: copies the a0 coefficient (CONSTANT), evaluates
 * a0 + x*dadx + y*dady (LINEAR), or that expression divided by w
 * (PERSPECTIVE), reading x/y/w from machine temp 0.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2707
2708 static void aos_to_soa( struct x86_function *func,
2709 uint arg_aos,
2710 uint arg_machine,
2711 uint arg_num,
2712 uint arg_stride )
2713 {
2714 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2715 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2716 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2717 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2718 int inner_loop;
2719
2720
2721 /* Save EBX */
2722 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2723
2724 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2725 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2726 x86_lea( func, soa_input,
2727 x86_make_disp( soa_input,
2728 Offset(struct tgsi_exec_machine, Inputs) ) );
2729 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2730 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2731
2732 /* do */
2733 inner_loop = x86_get_label( func );
2734 {
2735 x86_push( func, aos_input );
2736 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2737 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2738 x86_add( func, aos_input, stride );
2739 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2740 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2741 x86_add( func, aos_input, stride );
2742 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2743 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2744 x86_add( func, aos_input, stride );
2745 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2746 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2747 x86_pop( func, aos_input );
2748
2749 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2750 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2751 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2752 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2753 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2754 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2755
2756 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2757 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2758 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2759 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2760
2761 /* Advance to next input */
2762 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2763 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2764 }
2765 /* while --num_inputs */
2766 x86_dec( func, num_inputs );
2767 x86_jcc( func, cc_NE, inner_loop );
2768
2769 /* Restore EBX */
2770 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2771 }
2772
2773 static void soa_to_aos( struct x86_function *func,
2774 uint arg_aos,
2775 uint arg_machine,
2776 uint arg_num,
2777 uint arg_stride )
2778 {
2779 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2780 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2781 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2782 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2783 int inner_loop;
2784
2785 /* Save EBX */
2786 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2787
2788 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2789 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2790 x86_lea( func, soa_output,
2791 x86_make_disp( soa_output,
2792 Offset(struct tgsi_exec_machine, Outputs) ) );
2793 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2794
2795 /* do */
2796 inner_loop = x86_get_label( func );
2797 {
2798 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2799 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2800 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2801 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2802
2803 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2804 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2805 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2806 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2807 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2808 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2809
2810 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2811 x86_push( func, aos_output );
2812 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2813 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2814 x86_add( func, aos_output, temp );
2815 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2816 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2817 x86_add( func, aos_output, temp );
2818 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2819 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2820 x86_add( func, aos_output, temp );
2821 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2822 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2823 x86_pop( func, aos_output );
2824
2825 /* Advance to next output */
2826 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2827 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2828 }
2829 /* while --num_outputs */
2830 x86_dec( func, num_outputs );
2831 x86_jcc( func, cc_NE, inner_loop );
2832
2833 /* Restore EBX */
2834 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2835 }
2836
2837 /**
2838 * Translate a TGSI vertex/fragment shader to SSE2 code.
2839 * Slightly different things are done for vertex vs. fragment shaders.
2840 *
2841 * \param tokens the TGSI input shader
2842 * \param func the output SSE code/function
2843 * \param immediates buffer to place immediates, later passed to SSE func
2844 * \param return 1 for success, 0 if translation failed
2845 */
2846 unsigned
2847 tgsi_emit_sse2(
2848 const struct tgsi_token *tokens,
2849 struct x86_function *func,
2850 float (*immediates)[4],
2851 boolean do_swizzles )
2852 {
2853 struct tgsi_parse_context parse;
2854 unsigned ok = 1;
2855 uint num_immediates = 0;
2856
2857 util_init_math();
2858
2859 func->csr = func->store;
2860
2861 tgsi_parse_init( &parse, tokens );
2862
2863 /* Can't just use EDI, EBX without save/restoring them:
2864 */
2865 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2866 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2867
2868 /*
2869 * Different function args for vertex/fragment shaders:
2870 */
2871 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2872 if (do_swizzles)
2873 aos_to_soa( func,
2874 4, /* aos_input */
2875 1, /* machine */
2876 5, /* num_inputs */
2877 6 ); /* input_stride */
2878 }
2879
2880 x86_mov(
2881 func,
2882 get_machine_base(),
2883 x86_fn_arg( func, 1 ) );
2884 x86_mov(
2885 func,
2886 get_const_base(),
2887 x86_fn_arg( func, 2 ) );
2888 x86_mov(
2889 func,
2890 get_immediate_base(),
2891 x86_fn_arg( func, 3 ) );
2892
2893 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2894 x86_mov(
2895 func,
2896 get_coef_base(),
2897 x86_fn_arg( func, 4 ) );
2898 }
2899
2900 x86_mov(
2901 func,
2902 get_sampler_base(),
2903 x86_make_disp( get_machine_base(),
2904 Offset( struct tgsi_exec_machine, Samplers ) ) );
2905
2906
2907 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2908 tgsi_parse_token( &parse );
2909
2910 switch( parse.FullToken.Token.Type ) {
2911 case TGSI_TOKEN_TYPE_DECLARATION:
2912 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2913 emit_declaration(
2914 func,
2915 &parse.FullToken.FullDeclaration );
2916 }
2917 break;
2918
2919 case TGSI_TOKEN_TYPE_INSTRUCTION:
2920 ok = emit_instruction(
2921 func,
2922 &parse.FullToken.FullInstruction );
2923
2924 if (!ok) {
2925 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2926 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2927 opcode,
2928 tgsi_get_opcode_name(opcode),
2929 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2930 "vertex shader" : "fragment shader");
2931 }
2932 break;
2933
2934 case TGSI_TOKEN_TYPE_IMMEDIATE:
2935 /* simply copy the immediate values into the next immediates[] slot */
2936 {
2937 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2938 uint i;
2939 assert(size <= 4);
2940 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2941 for( i = 0; i < size; i++ ) {
2942 immediates[num_immediates][i] =
2943 parse.FullToken.FullImmediate.u[i].Float;
2944 }
2945 #if 0
2946 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2947 num_immediates,
2948 immediates[num_immediates][0],
2949 immediates[num_immediates][1],
2950 immediates[num_immediates][2],
2951 immediates[num_immediates][3]);
2952 #endif
2953 num_immediates++;
2954 }
2955 break;
2956
2957 default:
2958 ok = 0;
2959 assert( 0 );
2960 }
2961 }
2962
2963 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2964 if (do_swizzles)
2965 soa_to_aos( func,
2966 7, /* aos_output */
2967 1, /* machine */
2968 8, /* num_outputs */
2969 9 ); /* output_stride */
2970 }
2971
2972 /* Can't just use EBX, EDI without save/restoring them:
2973 */
2974 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2975 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2976
2977 emit_ret( func );
2978
2979 tgsi_parse_free( &parse );
2980
2981 return ok;
2982 }
2983
2984 #endif /* PIPE_ARCH_X86 */
2985