Merge branch 'mesa_7_6_branch'
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_info.h"
40 #include "tgsi/tgsi_parse.h"
41 #include "tgsi/tgsi_util.h"
42 #include "tgsi/tgsi_dump.h"
43 #include "tgsi/tgsi_exec.h"
44 #include "tgsi/tgsi_sse2.h"
45
46 #include "rtasm/rtasm_x86sse.h"
47
48 /* for 1/sqrt()
49 *
50 * This costs about 100fps (close to 10%) in gears:
51 */
52 #define HIGH_PRECISION 1
53
54 #define FAST_MATH 1
55
56
57 #define FOR_EACH_CHANNEL( CHAN )\
58 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
59
60 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
61 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
62
63 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
64 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
65
66 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
67 FOR_EACH_CHANNEL( CHAN )\
68 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
69
70 #define CHAN_X 0
71 #define CHAN_Y 1
72 #define CHAN_Z 2
73 #define CHAN_W 3
74
75 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
76 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
77
78 #define TEMP_R0 TGSI_EXEC_TEMP_R0
79 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
80 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
81 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
82
83
84 /**
85 * X86 utility functions.
86 */
87
88 static struct x86_reg
89 make_xmm(
90 unsigned xmm )
91 {
92 return x86_make_reg(
93 file_XMM,
94 (enum x86_reg_name) xmm );
95 }
96
97 /**
98 * X86 register mapping helpers.
99 */
100
101 static struct x86_reg
102 get_const_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_machine_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_CX );
115 }
116
117 static struct x86_reg
118 get_input_base( void )
119 {
120 return x86_make_disp(
121 get_machine_base(),
122 Offset(struct tgsi_exec_machine, Inputs) );
123 }
124
125 static struct x86_reg
126 get_output_base( void )
127 {
128 return x86_make_disp(
129 get_machine_base(),
130 Offset(struct tgsi_exec_machine, Outputs) );
131 }
132
133 static struct x86_reg
134 get_temp_base( void )
135 {
136 return x86_make_disp(
137 get_machine_base(),
138 Offset(struct tgsi_exec_machine, Temps) );
139 }
140
141 static struct x86_reg
142 get_coef_base( void )
143 {
144 return x86_make_reg(
145 file_REG32,
146 reg_BX );
147 }
148
149 static struct x86_reg
150 get_sampler_base( void )
151 {
152 return x86_make_reg(
153 file_REG32,
154 reg_DI );
155 }
156
157 static struct x86_reg
158 get_immediate_base( void )
159 {
160 return x86_make_reg(
161 file_REG32,
162 reg_DX );
163 }
164
165
166 /**
167 * Data access helpers.
168 */
169
170
171 static struct x86_reg
172 get_immediate(
173 unsigned vec,
174 unsigned chan )
175 {
176 return x86_make_disp(
177 get_immediate_base(),
178 (vec * 4 + chan) * 4 );
179 }
180
181 static struct x86_reg
182 get_const(
183 unsigned vec,
184 unsigned chan )
185 {
186 return x86_make_disp(
187 get_const_base(),
188 (vec * 4 + chan) * 4 );
189 }
190
191 static struct x86_reg
192 get_sampler_ptr(
193 unsigned unit )
194 {
195 return x86_make_disp(
196 get_sampler_base(),
197 unit * sizeof( struct tgsi_sampler * ) );
198 }
199
200 static struct x86_reg
201 get_input(
202 unsigned vec,
203 unsigned chan )
204 {
205 return x86_make_disp(
206 get_input_base(),
207 (vec * 4 + chan) * 16 );
208 }
209
210 static struct x86_reg
211 get_output(
212 unsigned vec,
213 unsigned chan )
214 {
215 return x86_make_disp(
216 get_output_base(),
217 (vec * 4 + chan) * 16 );
218 }
219
220 static struct x86_reg
221 get_temp(
222 unsigned vec,
223 unsigned chan )
224 {
225 return x86_make_disp(
226 get_temp_base(),
227 (vec * 4 + chan) * 16 );
228 }
229
230 static struct x86_reg
231 get_coef(
232 unsigned vec,
233 unsigned chan,
234 unsigned member )
235 {
236 return x86_make_disp(
237 get_coef_base(),
238 ((vec * 3 + member) * 4 + chan) * 4 );
239 }
240
241
/** Emit a return instruction, ending the generated function. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
248
249
250 /**
251 * Data fetch helpers.
252 */
253
254 /**
255 * Copy a shader constant to xmm register
256 * \param xmm the destination xmm register
257 * \param vec the src const buffer index
258 * \param chan src channel to fetch (X, Y, Z or W)
259 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      /* Only ADDR[0] indirection is supported by this code path. */
      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* r0/r1 double as scratch GP registers here; preserve their
       * normal roles (immediate base / coef base) across the loop.
       */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 (zero the index for dead channels) */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          * (each constant vector is 16 bytes)
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );    /* r0 = r0 + r1 */
         /* load the selected constant and stash it in TEMP_R0[i] */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Finally move the four gathered scalars into the xmm register. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar, then broadcast it to all four SOA lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
347
348 static void
349 emit_immediate(
350 struct x86_function *func,
351 unsigned xmm,
352 unsigned vec,
353 unsigned chan )
354 {
355 sse_movss(
356 func,
357 make_xmm( xmm ),
358 get_immediate( vec, chan ) );
359 sse_shufps(
360 func,
361 make_xmm( xmm ),
362 make_xmm( xmm ),
363 SHUF( 0, 0, 0, 0 ) );
364 }
365
366
367 /**
368 * Copy a shader input to xmm register
369 * \param xmm the destination xmm register
370 * \param vec the src input attrib
371 * \param chan src channel to fetch (X, Y, Z or W)
372 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* whole SOA quad in one unaligned load */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
385
386 /**
387 * Store an xmm register to a shader output
388 * \param xmm the source xmm register
389 * \param vec the dest output attrib
390 * \param chan src dest channel to store (X, Y, Z or W)
391 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* store the whole SOA quad (unaligned) */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
404
405 /**
406 * Copy a shader temporary to xmm register
407 * \param xmm the destination xmm register
408 * \param vec the src temp register
409 * \param chan src channel to fetch (X, Y, Z or W)
410 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* temps are 16-byte aligned, so an aligned load is safe */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
423
424 /**
425 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
426 * \param xmm the destination xmm register
427 * \param vec the src input/attribute coefficient index
428 * \param chan src channel to fetch (X, Y, Z or W)
429 * \param member 0=a0, 1=dadx, 2=dady
430 */
431 static void
432 emit_coef(
433 struct x86_function *func,
434 unsigned xmm,
435 unsigned vec,
436 unsigned chan,
437 unsigned member )
438 {
439 sse_movss(
440 func,
441 make_xmm( xmm ),
442 get_coef( vec, chan, member ) );
443 sse_shufps(
444 func,
445 make_xmm( xmm ),
446 make_xmm( xmm ),
447 SHUF( 0, 0, 0, 0 ) );
448 }
449
450 /**
451 * Data store helpers.
452 */
453
/** Store xmm to input attrib 'vec'/'chan' (unaligned). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
466
/** Store xmm to temp register 'vec'/'chan' (aligned). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
479
480 static void
481 emit_addrs(
482 struct x86_function *func,
483 unsigned xmm,
484 unsigned vec,
485 unsigned chan )
486 {
487 assert( vec == 0 );
488
489 emit_temps(
490 func,
491 xmm,
492 vec + TGSI_EXEC_TEMP_ADDR,
493 chan );
494 }
495
496 /**
 * Coefficient fetch helpers.
498 */
499
/** Fetch the a0 (constant) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
514
/** Fetch the dadx (x-derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
529
/** Fetch the dady (y-derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
544
545 /**
546 * Function call helpers.
547 */
548
549 /**
550 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
551 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
552 * that the stack pointer is 16 byte aligned, as expected.
553 */
/**
 * Emit a call from generated code to the C function 'code'.
 *
 * Saves EAX/ECX/EDX and the XMM registers named in 'xmm_save_mask'
 * (bit i => xmm i) around the call, pushes the addresses in 'arg'
 * as cdecl arguments, then restores everything.
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save caller-saved GP registers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many XMM slots we need... */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   /* ...reserve stack space for them in one adjustment... */
   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* ...then spill each saved register to its slot. */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   /* Indirect call through ECX (cdecl: callee may clobber ECX). */
   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the XMM spill area (n now equals the saved-register count). */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
647
648 static void
649 emit_func_call_dst_src1(
650 struct x86_function *func,
651 unsigned xmm_save,
652 unsigned xmm_dst,
653 unsigned xmm_src0,
654 void (PIPE_CDECL *code)() )
655 {
656 struct x86_reg store = get_temp( TEMP_R0, 0 );
657 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
658
659 /* Store our input parameters (in xmm regs) to the buffer we use
660 * for passing arguments. We will pass a pointer to this buffer as
661 * the actual function argument.
662 */
663 sse_movaps(
664 func,
665 store,
666 make_xmm( xmm_src0 ) );
667
668 emit_func_call( func,
669 xmm_mask,
670 &store,
671 1,
672 code );
673
674 sse_movaps(
675 func,
676 make_xmm( xmm_dst ),
677 store );
678 }
679
680
681 static void
682 emit_func_call_dst_src2(
683 struct x86_function *func,
684 unsigned xmm_save,
685 unsigned xmm_dst,
686 unsigned xmm_src0,
687 unsigned xmm_src1,
688 void (PIPE_CDECL *code)() )
689 {
690 struct x86_reg store = get_temp( TEMP_R0, 0 );
691 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
692
693 /* Store two inputs to parameter buffer.
694 */
695 sse_movaps(
696 func,
697 store,
698 make_xmm( xmm_src0 ) );
699
700 sse_movaps(
701 func,
702 x86_make_disp( store, 4 * sizeof(float) ),
703 make_xmm( xmm_src1 ) );
704
705
706 /* Emit the call
707 */
708 emit_func_call( func,
709 xmm_mask,
710 &store,
711 1,
712 code );
713
714 /* Retrieve the results:
715 */
716 sse_movaps(
717 func,
718 make_xmm( xmm_dst ),
719 store );
720 }
721
722
723
724
725
726 #if defined(PIPE_ARCH_SSE)
727
728 /*
729 * Fast SSE2 implementation of special math functions.
730 */
731
732 #define POLY0(x, c0) _mm_set1_ps(c0)
733 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
734 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
735 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
736 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
737 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
738
739 #define EXP_POLY_DEGREE 3
740 #define LOG_POLY_DEGREE 5
741
742 /**
743 * See http://www.devmaster.net/forums/showthread.php?p=43580
744 */
/**
 * Vectorized 2^x approximation for four floats.
 * Splits x into integer and fractional parts; the integer part becomes
 * the float exponent directly, the fraction is handled by a minimax
 * polynomial of degree EXP_POLY_DEGREE.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* clamp to the exactly-representable exponent range */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) -- built by biasing the exponent field */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(expipart, expfpart);
}
778
779
780 /**
781 * See http://www.devmaster.net/forums/showthread.php?p=43580
782 */
/**
 * Vectorized log2(x) approximation for four floats.
 * Decomposes x into exponent and mantissa via its IEEE-754 bit pattern,
 * then evaluates a minimax polynomial on the mantissa.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) -- normalized into [1, 2[ by OR-ing in 1.0 */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}
821
822
823 static INLINE __m128
824 powf4(__m128 x, __m128 y)
825 {
826 return exp2f4(_mm_mul_ps(log2f4(x), y));
827 }
828
829 #endif /* PIPE_ARCH_SSE */
830
831
832
833 /**
834 * Low-level instruction translators.
835 */
836
837 static void
838 emit_abs(
839 struct x86_function *func,
840 unsigned xmm )
841 {
842 sse_andps(
843 func,
844 make_xmm( xmm ),
845 get_temp(
846 TGSI_EXEC_TEMP_7FFFFFFF_I,
847 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
848 }
849
/** dst += src, componentwise. */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
861
862 static void PIPE_CDECL
863 cos4f(
864 float *store )
865 {
866 store[0] = cosf( store[0] );
867 store[1] = cosf( store[1] );
868 store[2] = cosf( store[2] );
869 store[3] = cosf( store[3] );
870 }
871
872 static void
873 emit_cos(
874 struct x86_function *func,
875 unsigned xmm_save,
876 unsigned xmm_dst )
877 {
878 emit_func_call_dst_src1(
879 func,
880 xmm_save,
881 xmm_dst,
882 xmm_dst,
883 cos4f );
884 }
885
/** In-place approximate 2^x of the four floats in 'store'. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
902
903 static void
904 emit_ex2(
905 struct x86_function *func,
906 unsigned xmm_save,
907 unsigned xmm_dst )
908 {
909 emit_func_call_dst_src1(
910 func,
911 xmm_save,
912 xmm_dst,
913 xmm_dst,
914 ex24f );
915 }
916
917 static void
918 emit_f2it(
919 struct x86_function *func,
920 unsigned xmm )
921 {
922 sse2_cvttps2dq(
923 func,
924 make_xmm( xmm ),
925 make_xmm( xmm ) );
926 }
927
928 static void
929 emit_i2f(
930 struct x86_function *func,
931 unsigned xmm )
932 {
933 sse2_cvtdq2ps(
934 func,
935 make_xmm( xmm ),
936 make_xmm( xmm ) );
937 }
938
939 static void PIPE_CDECL
940 flr4f(
941 float *store )
942 {
943 store[0] = floorf( store[0] );
944 store[1] = floorf( store[1] );
945 store[2] = floorf( store[2] );
946 store[3] = floorf( store[3] );
947 }
948
949 static void
950 emit_flr(
951 struct x86_function *func,
952 unsigned xmm_save,
953 unsigned xmm_dst )
954 {
955 emit_func_call_dst_src1(
956 func,
957 xmm_save,
958 xmm_dst,
959 xmm_dst,
960 flr4f );
961 }
962
963 static void PIPE_CDECL
964 frc4f(
965 float *store )
966 {
967 store[0] -= floorf( store[0] );
968 store[1] -= floorf( store[1] );
969 store[2] -= floorf( store[2] );
970 store[3] -= floorf( store[3] );
971 }
972
973 static void
974 emit_frc(
975 struct x86_function *func,
976 unsigned xmm_save,
977 unsigned xmm_dst )
978 {
979 emit_func_call_dst_src1(
980 func,
981 xmm_save,
982 xmm_dst,
983 xmm_dst,
984 frc4f );
985 }
986
/** In-place approximate log2(x) of the four floats in 'store'. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1003
1004 static void
1005 emit_lg2(
1006 struct x86_function *func,
1007 unsigned xmm_save,
1008 unsigned xmm_dst )
1009 {
1010 emit_func_call_dst_src1(
1011 func,
1012 xmm_save,
1013 xmm_dst,
1014 xmm_dst,
1015 lg24f );
1016 }
1017
/** Register-to-register copy of a full quad. */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1029
/** dst *= src, componentwise. */
static void
emit_mul(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1040
1041 static void
1042 emit_neg(
1043 struct x86_function *func,
1044 unsigned xmm )
1045 {
1046 sse_xorps(
1047 func,
1048 make_xmm( xmm ),
1049 get_temp(
1050 TGSI_EXEC_TEMP_80000000_I,
1051 TGSI_EXEC_TEMP_80000000_C ) );
1052 }
1053
/**
 * In-place approximate pow: store[i] = store[i] ** store[i+4] for i in 0..3.
 * The two operands arrive packed back-to-back in the argument buffer.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1070
1071 static void
1072 emit_pow(
1073 struct x86_function *func,
1074 unsigned xmm_save,
1075 unsigned xmm_dst,
1076 unsigned xmm_src0,
1077 unsigned xmm_src1 )
1078 {
1079 emit_func_call_dst_src2(
1080 func,
1081 xmm_save,
1082 xmm_dst,
1083 xmm_src0,
1084 xmm_src1,
1085 pow4f );
1086 }
1087
/** dst = approximate 1/src via rcpps. */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, rcpps is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1103
1104 static void PIPE_CDECL
1105 rnd4f(
1106 float *store )
1107 {
1108 store[0] = floorf( store[0] + 0.5f );
1109 store[1] = floorf( store[1] + 0.5f );
1110 store[2] = floorf( store[2] + 0.5f );
1111 store[3] = floorf( store[3] + 0.5f );
1112 }
1113
1114 static void
1115 emit_rnd(
1116 struct x86_function *func,
1117 unsigned xmm_save,
1118 unsigned xmm_dst )
1119 {
1120 emit_func_call_dst_src1(
1121 func,
1122 xmm_save,
1123 xmm_dst,
1124 xmm_dst,
1125 rnd4f );
1126 }
1127
/**
 * dst = 1/sqrt(src).
 * NOTE: with HIGH_PRECISION this clobbers xmm_src and scratch regs xmm2/xmm3.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* xmm2/xmm3 are used as scratch, so neither operand may live there */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst  = 0.5           */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0           */
      sse_rsqrtps( func, tmp1, src );  /* tmp1 = r ~ rsqrt(a)  */
      sse_mulps( func, src, tmp1 );    /* src  = a*r  (src clobbered from here on) */
      sse_mulps( func, dst, tmp1 );    /* dst  = 0.5*r         */
      sse_mulps( func, src, tmp1 );    /* src  = a*r*r         */
      sse_subps( func, tmp0, src );    /* tmp0 = 3 - a*r*r     */
      sse_mulps( func, dst, tmp0 );    /* dst  = 0.5*r*(3 - a*r*r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1173
1174 static void
1175 emit_setsign(
1176 struct x86_function *func,
1177 unsigned xmm )
1178 {
1179 sse_orps(
1180 func,
1181 make_xmm( xmm ),
1182 get_temp(
1183 TGSI_EXEC_TEMP_80000000_I,
1184 TGSI_EXEC_TEMP_80000000_C ) );
1185 }
1186
1187 static void PIPE_CDECL
1188 sgn4f(
1189 float *store )
1190 {
1191 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1192 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1193 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1194 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1195 }
1196
1197 static void
1198 emit_sgn(
1199 struct x86_function *func,
1200 unsigned xmm_save,
1201 unsigned xmm_dst )
1202 {
1203 emit_func_call_dst_src1(
1204 func,
1205 xmm_save,
1206 xmm_dst,
1207 xmm_dst,
1208 sgn4f );
1209 }
1210
1211 static void PIPE_CDECL
1212 sin4f(
1213 float *store )
1214 {
1215 store[0] = sinf( store[0] );
1216 store[1] = sinf( store[1] );
1217 store[2] = sinf( store[2] );
1218 store[3] = sinf( store[3] );
1219 }
1220
1221 static void
1222 emit_sin (struct x86_function *func,
1223 unsigned xmm_save,
1224 unsigned xmm_dst)
1225 {
1226 emit_func_call_dst_src1(
1227 func,
1228 xmm_save,
1229 xmm_dst,
1230 xmm_dst,
1231 sin4f );
1232 }
1233
/** dst -= src, componentwise. */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1245
1246
1247
1248
1249
1250
1251
1252 /**
1253 * Register fetch.
1254 */
1255
/**
 * Emit code to load one channel of a source operand into an xmm register,
 * honoring the operand's extended swizzle (including the ZERO/ONE
 * pseudo-channels) and its sign mode (abs / set-sign / negate).
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* A real channel: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Constant 0.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Constant 1.0 lives in a reserved temp slot. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode on top of the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1348
1349 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1350 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1351
1352 /**
1353 * Register store.
1354 */
1355
/**
 * Emit code to store an xmm register into one channel of a destination
 * operand, applying the instruction's saturation mode first.
 * Note: TGSI_SAT_MINUS_PLUS_ONE is not implemented (asserts).
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* clamp to [0, 1]: max with 0.0 then min with 1.0 */
      sse_maxps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ) );

      sse_minps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_ONE_I,
            TGSI_EXEC_TEMP_ONE_C ) );
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }


   /* Dispatch on the destination register file. */
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }
}
1419
/* Convenience wrapper: store xmm register XMM to channel CHAN of
 * destination operand INDEX of instruction INST (applies saturation).
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1422
1423
/**
 * Trampoline called from the generated code (via emit_func_call) to
 * sample a texture.
 *
 * \param sampler  pointer to the sampler pointer for the texture unit
 * \param store    points at the TEMP_R0 scratch rows: on entry rows
 *                 0/4/8 hold the s/t/p texcoords for a quad and row 12
 *                 holds the lodbias (currently ignored -- 0.0 is passed
 *                 to get_samples below); on exit the first 16 floats
 *                 are overwritten with the RGBA results, one row per
 *                 color channel.
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   /* debug dump of incoming arguments (disabled) */
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* overwrite the texcoord buffer with the sampled colors */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   /* debug dump of results (disabled) */
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1467
1468 /**
1469 * High-level instruction translators.
1470 */
1471
/**
 * Emit code for a TGSI texture-sampling instruction (TEX/TXB/TXL/TXP).
 *
 * \param lodbias    fetch src[0].w and stage it as the bias/lod value
 *                   (TXB/TXL paths); otherwise 0.0 is staged
 * \param projected  divide the texcoords by src[0].w first (TXP)
 *
 * The texcoords (and the bias) are staged in the TEMP_R0 scratch rows,
 * fetch_texel() is invoked through emit_func_call(), and the results
 * are copied from TEMP_R0 to the enabled destination channels.
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* number of texcoord components required by the texture target */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* xmm3 = lodbias (src[0].w) or 0.0 */
   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    * (NOTE(review): fetch_texel above actually hard-wires 0.0 -- the
    * staged value is written but not consumed; confirm before relying
    * on bias.)
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      /* xmm3 = 1 / src[0].w -- safe to clobber xmm3 since the bias was
       * already saved to TEMP_R0 row 3 above.
       */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         /* texcoord *= 1/w */
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1570
1571
/**
 * Emit code for TGSI_OPCODE_KIL: kill the quad's fragments where any
 * tested source component is < 0, by OR-ing the comparison sign mask
 * into the KILMASK temp.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register: each unique component lands in its own
          * xmm register (xmm0, xmm1, ...) */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* EAX/EDX are used as scratch below; preserve the caller's values */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm = per-component all-ones where value < 0, zero elsewhere */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      /* accumulate the 4-bit sign masks into EAX (first via movmskps,
       * subsequent ones through EDX then OR-ed in) */
      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* KILMASK |= accumulated kill mask */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   /* restore scratch registers */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1658
1659
/**
 * Emit code for TGSI_OPCODE_KILP (predicated kill).
 * Not implemented -- the caller returns 0 for KILP so execution falls
 * back to the interpreter path.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1666
1667
1668 static void
1669 emit_setcc(
1670 struct x86_function *func,
1671 struct tgsi_full_instruction *inst,
1672 enum sse_cc cc )
1673 {
1674 unsigned chan_index;
1675
1676 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1677 FETCH( func, *inst, 0, 0, chan_index );
1678 FETCH( func, *inst, 1, 1, chan_index );
1679 sse_cmpps(
1680 func,
1681 make_xmm( 0 ),
1682 make_xmm( 1 ),
1683 cc );
1684 sse_andps(
1685 func,
1686 make_xmm( 0 ),
1687 get_temp(
1688 TEMP_ONE_I,
1689 TEMP_ONE_C ) );
1690 STORE( func, *inst, 0, 0, chan_index );
1691 }
1692 }
1693
1694 static void
1695 emit_cmp(
1696 struct x86_function *func,
1697 struct tgsi_full_instruction *inst )
1698 {
1699 unsigned chan_index;
1700
1701 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1702 FETCH( func, *inst, 0, 0, chan_index );
1703 FETCH( func, *inst, 1, 1, chan_index );
1704 FETCH( func, *inst, 2, 2, chan_index );
1705 sse_cmpps(
1706 func,
1707 make_xmm( 0 ),
1708 get_temp(
1709 TGSI_EXEC_TEMP_00000000_I,
1710 TGSI_EXEC_TEMP_00000000_C ),
1711 cc_LessThan );
1712 sse_andps(
1713 func,
1714 make_xmm( 1 ),
1715 make_xmm( 0 ) );
1716 sse_andnps(
1717 func,
1718 make_xmm( 0 ),
1719 make_xmm( 2 ) );
1720 sse_orps(
1721 func,
1722 make_xmm( 0 ),
1723 make_xmm( 1 ) );
1724 STORE( func, *inst, 0, 0, chan_index );
1725 }
1726 }
1727
1728
1729 /**
1730 * Check if inst src/dest regs use indirect addressing into temporary
1731 * register file.
1732 */
1733 static boolean
1734 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1735 {
1736 uint i;
1737 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1738 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1739 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1740 reg->SrcRegister.Indirect)
1741 return TRUE;
1742 }
1743 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1744 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1745 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1746 reg->DstRegister.Indirect)
1747 return TRUE;
1748 }
1749 return FALSE;
1750 }
1751
1752
1753 static int
1754 emit_instruction(
1755 struct x86_function *func,
1756 struct tgsi_full_instruction *inst )
1757 {
1758 unsigned chan_index;
1759
1760 /* we can't handle indirect addressing into temp register file yet */
1761 if (indirect_temp_reference(inst))
1762 return FALSE;
1763
1764 switch (inst->Instruction.Opcode) {
1765 case TGSI_OPCODE_ARL:
1766 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1767 FETCH( func, *inst, 0, 0, chan_index );
1768 emit_flr(func, 0, 0);
1769 emit_f2it( func, 0 );
1770 STORE( func, *inst, 0, 0, chan_index );
1771 }
1772 break;
1773
1774 case TGSI_OPCODE_MOV:
1775 case TGSI_OPCODE_SWZ:
1776 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1777 FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1778 }
1779 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1780 STORE( func, *inst, 4 + chan_index, 0, chan_index );
1781 }
1782 break;
1783
1784 case TGSI_OPCODE_LIT:
1785 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1786 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1787 emit_tempf(
1788 func,
1789 0,
1790 TEMP_ONE_I,
1791 TEMP_ONE_C);
1792 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1793 STORE( func, *inst, 0, 0, CHAN_X );
1794 }
1795 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1796 STORE( func, *inst, 0, 0, CHAN_W );
1797 }
1798 }
1799 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1800 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1801 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1802 FETCH( func, *inst, 0, 0, CHAN_X );
1803 sse_maxps(
1804 func,
1805 make_xmm( 0 ),
1806 get_temp(
1807 TGSI_EXEC_TEMP_00000000_I,
1808 TGSI_EXEC_TEMP_00000000_C ) );
1809 STORE( func, *inst, 0, 0, CHAN_Y );
1810 }
1811 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1812 /* XMM[1] = SrcReg[0].yyyy */
1813 FETCH( func, *inst, 1, 0, CHAN_Y );
1814 /* XMM[1] = max(XMM[1], 0) */
1815 sse_maxps(
1816 func,
1817 make_xmm( 1 ),
1818 get_temp(
1819 TGSI_EXEC_TEMP_00000000_I,
1820 TGSI_EXEC_TEMP_00000000_C ) );
1821 /* XMM[2] = SrcReg[0].wwww */
1822 FETCH( func, *inst, 2, 0, CHAN_W );
1823 /* XMM[2] = min(XMM[2], 128.0) */
1824 sse_minps(
1825 func,
1826 make_xmm( 2 ),
1827 get_temp(
1828 TGSI_EXEC_TEMP_128_I,
1829 TGSI_EXEC_TEMP_128_C ) );
1830 /* XMM[2] = max(XMM[2], -128.0) */
1831 sse_maxps(
1832 func,
1833 make_xmm( 2 ),
1834 get_temp(
1835 TGSI_EXEC_TEMP_MINUS_128_I,
1836 TGSI_EXEC_TEMP_MINUS_128_C ) );
1837 emit_pow( func, 3, 1, 1, 2 );
1838 FETCH( func, *inst, 0, 0, CHAN_X );
1839 sse_xorps(
1840 func,
1841 make_xmm( 2 ),
1842 make_xmm( 2 ) );
1843 sse_cmpps(
1844 func,
1845 make_xmm( 2 ),
1846 make_xmm( 0 ),
1847 cc_LessThan );
1848 sse_andps(
1849 func,
1850 make_xmm( 2 ),
1851 make_xmm( 1 ) );
1852 STORE( func, *inst, 2, 0, CHAN_Z );
1853 }
1854 }
1855 break;
1856
1857 case TGSI_OPCODE_RCP:
1858 /* TGSI_OPCODE_RECIP */
1859 FETCH( func, *inst, 0, 0, CHAN_X );
1860 emit_rcp( func, 0, 0 );
1861 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1862 STORE( func, *inst, 0, 0, chan_index );
1863 }
1864 break;
1865
1866 case TGSI_OPCODE_RSQ:
1867 /* TGSI_OPCODE_RECIPSQRT */
1868 FETCH( func, *inst, 0, 0, CHAN_X );
1869 emit_abs( func, 0 );
1870 emit_rsqrt( func, 1, 0 );
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1872 STORE( func, *inst, 1, 0, chan_index );
1873 }
1874 break;
1875
1876 case TGSI_OPCODE_EXP:
1877 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1878 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1879 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1880 FETCH( func, *inst, 0, 0, CHAN_X );
1881 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1882 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1883 emit_MOV( func, 1, 0 );
1884 emit_flr( func, 2, 1 );
1885 /* dst.x = ex2(floor(src.x)) */
1886 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1887 emit_MOV( func, 2, 1 );
1888 emit_ex2( func, 3, 2 );
1889 STORE( func, *inst, 2, 0, CHAN_X );
1890 }
1891 /* dst.y = src.x - floor(src.x) */
1892 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1893 emit_MOV( func, 2, 0 );
1894 emit_sub( func, 2, 1 );
1895 STORE( func, *inst, 2, 0, CHAN_Y );
1896 }
1897 }
1898 /* dst.z = ex2(src.x) */
1899 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1900 emit_ex2( func, 3, 0 );
1901 STORE( func, *inst, 0, 0, CHAN_Z );
1902 }
1903 }
1904 /* dst.w = 1.0 */
1905 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1906 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1907 STORE( func, *inst, 0, 0, CHAN_W );
1908 }
1909 break;
1910
1911 case TGSI_OPCODE_LOG:
1912 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1913 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1914 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1915 FETCH( func, *inst, 0, 0, CHAN_X );
1916 emit_abs( func, 0 );
1917 emit_MOV( func, 1, 0 );
1918 emit_lg2( func, 2, 1 );
1919 /* dst.z = lg2(abs(src.x)) */
1920 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1921 STORE( func, *inst, 1, 0, CHAN_Z );
1922 }
1923 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1924 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1925 emit_flr( func, 2, 1 );
1926 /* dst.x = floor(lg2(abs(src.x))) */
1927 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1928 STORE( func, *inst, 1, 0, CHAN_X );
1929 }
1930 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1931 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1932 emit_ex2( func, 2, 1 );
1933 emit_rcp( func, 1, 1 );
1934 emit_mul( func, 0, 1 );
1935 STORE( func, *inst, 0, 0, CHAN_Y );
1936 }
1937 }
1938 }
1939 /* dst.w = 1.0 */
1940 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1941 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1942 STORE( func, *inst, 0, 0, CHAN_W );
1943 }
1944 break;
1945
1946 case TGSI_OPCODE_MUL:
1947 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1948 FETCH( func, *inst, 0, 0, chan_index );
1949 FETCH( func, *inst, 1, 1, chan_index );
1950 emit_mul( func, 0, 1 );
1951 STORE( func, *inst, 0, 0, chan_index );
1952 }
1953 break;
1954
1955 case TGSI_OPCODE_ADD:
1956 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1957 FETCH( func, *inst, 0, 0, chan_index );
1958 FETCH( func, *inst, 1, 1, chan_index );
1959 emit_add( func, 0, 1 );
1960 STORE( func, *inst, 0, 0, chan_index );
1961 }
1962 break;
1963
1964 case TGSI_OPCODE_DP3:
1965 /* TGSI_OPCODE_DOT3 */
1966 FETCH( func, *inst, 0, 0, CHAN_X );
1967 FETCH( func, *inst, 1, 1, CHAN_X );
1968 emit_mul( func, 0, 1 );
1969 FETCH( func, *inst, 1, 0, CHAN_Y );
1970 FETCH( func, *inst, 2, 1, CHAN_Y );
1971 emit_mul( func, 1, 2 );
1972 emit_add( func, 0, 1 );
1973 FETCH( func, *inst, 1, 0, CHAN_Z );
1974 FETCH( func, *inst, 2, 1, CHAN_Z );
1975 emit_mul( func, 1, 2 );
1976 emit_add( func, 0, 1 );
1977 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1978 STORE( func, *inst, 0, 0, chan_index );
1979 }
1980 break;
1981
1982 case TGSI_OPCODE_DP4:
1983 /* TGSI_OPCODE_DOT4 */
1984 FETCH( func, *inst, 0, 0, CHAN_X );
1985 FETCH( func, *inst, 1, 1, CHAN_X );
1986 emit_mul( func, 0, 1 );
1987 FETCH( func, *inst, 1, 0, CHAN_Y );
1988 FETCH( func, *inst, 2, 1, CHAN_Y );
1989 emit_mul( func, 1, 2 );
1990 emit_add( func, 0, 1 );
1991 FETCH( func, *inst, 1, 0, CHAN_Z );
1992 FETCH( func, *inst, 2, 1, CHAN_Z );
1993 emit_mul(func, 1, 2 );
1994 emit_add(func, 0, 1 );
1995 FETCH( func, *inst, 1, 0, CHAN_W );
1996 FETCH( func, *inst, 2, 1, CHAN_W );
1997 emit_mul( func, 1, 2 );
1998 emit_add( func, 0, 1 );
1999 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2000 STORE( func, *inst, 0, 0, chan_index );
2001 }
2002 break;
2003
2004 case TGSI_OPCODE_DST:
2005 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2006 emit_tempf(
2007 func,
2008 0,
2009 TEMP_ONE_I,
2010 TEMP_ONE_C );
2011 STORE( func, *inst, 0, 0, CHAN_X );
2012 }
2013 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2014 FETCH( func, *inst, 0, 0, CHAN_Y );
2015 FETCH( func, *inst, 1, 1, CHAN_Y );
2016 emit_mul( func, 0, 1 );
2017 STORE( func, *inst, 0, 0, CHAN_Y );
2018 }
2019 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2020 FETCH( func, *inst, 0, 0, CHAN_Z );
2021 STORE( func, *inst, 0, 0, CHAN_Z );
2022 }
2023 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2024 FETCH( func, *inst, 0, 1, CHAN_W );
2025 STORE( func, *inst, 0, 0, CHAN_W );
2026 }
2027 break;
2028
2029 case TGSI_OPCODE_MIN:
2030 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2031 FETCH( func, *inst, 0, 0, chan_index );
2032 FETCH( func, *inst, 1, 1, chan_index );
2033 sse_minps(
2034 func,
2035 make_xmm( 0 ),
2036 make_xmm( 1 ) );
2037 STORE( func, *inst, 0, 0, chan_index );
2038 }
2039 break;
2040
2041 case TGSI_OPCODE_MAX:
2042 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2043 FETCH( func, *inst, 0, 0, chan_index );
2044 FETCH( func, *inst, 1, 1, chan_index );
2045 sse_maxps(
2046 func,
2047 make_xmm( 0 ),
2048 make_xmm( 1 ) );
2049 STORE( func, *inst, 0, 0, chan_index );
2050 }
2051 break;
2052
2053 case TGSI_OPCODE_SLT:
2054 /* TGSI_OPCODE_SETLT */
2055 emit_setcc( func, inst, cc_LessThan );
2056 break;
2057
2058 case TGSI_OPCODE_SGE:
2059 /* TGSI_OPCODE_SETGE */
2060 emit_setcc( func, inst, cc_NotLessThan );
2061 break;
2062
2063 case TGSI_OPCODE_MAD:
2064 /* TGSI_OPCODE_MADD */
2065 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2066 FETCH( func, *inst, 0, 0, chan_index );
2067 FETCH( func, *inst, 1, 1, chan_index );
2068 FETCH( func, *inst, 2, 2, chan_index );
2069 emit_mul( func, 0, 1 );
2070 emit_add( func, 0, 2 );
2071 STORE( func, *inst, 0, 0, chan_index );
2072 }
2073 break;
2074
2075 case TGSI_OPCODE_SUB:
2076 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2077 FETCH( func, *inst, 0, 0, chan_index );
2078 FETCH( func, *inst, 1, 1, chan_index );
2079 emit_sub( func, 0, 1 );
2080 STORE( func, *inst, 0, 0, chan_index );
2081 }
2082 break;
2083
2084 case TGSI_OPCODE_LRP:
2085 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2086 FETCH( func, *inst, 0, 0, chan_index );
2087 FETCH( func, *inst, 1, 1, chan_index );
2088 FETCH( func, *inst, 2, 2, chan_index );
2089 emit_sub( func, 1, 2 );
2090 emit_mul( func, 0, 1 );
2091 emit_add( func, 0, 2 );
2092 STORE( func, *inst, 0, 0, chan_index );
2093 }
2094 break;
2095
2096 case TGSI_OPCODE_CND:
2097 return 0;
2098 break;
2099
2100 case TGSI_OPCODE_DP2A:
2101 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2102 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2103 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2104 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2105 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2106 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2107 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2108 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2109 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2110 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2111 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2112 }
2113 break;
2114
2115 case TGSI_OPCODE_FRC:
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117 FETCH( func, *inst, 0, 0, chan_index );
2118 emit_frc( func, 0, 0 );
2119 STORE( func, *inst, 0, 0, chan_index );
2120 }
2121 break;
2122
2123 case TGSI_OPCODE_CLAMP:
2124 return 0;
2125 break;
2126
2127 case TGSI_OPCODE_FLR:
2128 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2129 FETCH( func, *inst, 0, 0, chan_index );
2130 emit_flr( func, 0, 0 );
2131 STORE( func, *inst, 0, 0, chan_index );
2132 }
2133 break;
2134
2135 case TGSI_OPCODE_ROUND:
2136 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2137 FETCH( func, *inst, 0, 0, chan_index );
2138 emit_rnd( func, 0, 0 );
2139 STORE( func, *inst, 0, 0, chan_index );
2140 }
2141 break;
2142
2143 case TGSI_OPCODE_EX2:
2144 FETCH( func, *inst, 0, 0, CHAN_X );
2145 emit_ex2( func, 0, 0 );
2146 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2147 STORE( func, *inst, 0, 0, chan_index );
2148 }
2149 break;
2150
2151 case TGSI_OPCODE_LG2:
2152 FETCH( func, *inst, 0, 0, CHAN_X );
2153 emit_lg2( func, 0, 0 );
2154 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2155 STORE( func, *inst, 0, 0, chan_index );
2156 }
2157 break;
2158
2159 case TGSI_OPCODE_POW:
2160 FETCH( func, *inst, 0, 0, CHAN_X );
2161 FETCH( func, *inst, 1, 1, CHAN_X );
2162 emit_pow( func, 0, 0, 0, 1 );
2163 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2164 STORE( func, *inst, 0, 0, chan_index );
2165 }
2166 break;
2167
2168 case TGSI_OPCODE_XPD:
2169 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2170 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2171 FETCH( func, *inst, 1, 1, CHAN_Z );
2172 FETCH( func, *inst, 3, 0, CHAN_Z );
2173 }
2174 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2175 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2176 FETCH( func, *inst, 0, 0, CHAN_Y );
2177 FETCH( func, *inst, 4, 1, CHAN_Y );
2178 }
2179 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2180 emit_MOV( func, 2, 0 );
2181 emit_mul( func, 2, 1 );
2182 emit_MOV( func, 5, 3 );
2183 emit_mul( func, 5, 4 );
2184 emit_sub( func, 2, 5 );
2185 STORE( func, *inst, 2, 0, CHAN_X );
2186 }
2187 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2188 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2189 FETCH( func, *inst, 2, 1, CHAN_X );
2190 FETCH( func, *inst, 5, 0, CHAN_X );
2191 }
2192 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2193 emit_mul( func, 3, 2 );
2194 emit_mul( func, 1, 5 );
2195 emit_sub( func, 3, 1 );
2196 STORE( func, *inst, 3, 0, CHAN_Y );
2197 }
2198 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2199 emit_mul( func, 5, 4 );
2200 emit_mul( func, 0, 2 );
2201 emit_sub( func, 5, 0 );
2202 STORE( func, *inst, 5, 0, CHAN_Z );
2203 }
2204 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2205 emit_tempf(
2206 func,
2207 0,
2208 TEMP_ONE_I,
2209 TEMP_ONE_C );
2210 STORE( func, *inst, 0, 0, CHAN_W );
2211 }
2212 break;
2213
2214 case TGSI_OPCODE_ABS:
2215 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2216 FETCH( func, *inst, 0, 0, chan_index );
2217 emit_abs( func, 0) ;
2218
2219 STORE( func, *inst, 0, 0, chan_index );
2220 }
2221 break;
2222
2223 case TGSI_OPCODE_RCC:
2224 return 0;
2225 break;
2226
2227 case TGSI_OPCODE_DPH:
2228 FETCH( func, *inst, 0, 0, CHAN_X );
2229 FETCH( func, *inst, 1, 1, CHAN_X );
2230 emit_mul( func, 0, 1 );
2231 FETCH( func, *inst, 1, 0, CHAN_Y );
2232 FETCH( func, *inst, 2, 1, CHAN_Y );
2233 emit_mul( func, 1, 2 );
2234 emit_add( func, 0, 1 );
2235 FETCH( func, *inst, 1, 0, CHAN_Z );
2236 FETCH( func, *inst, 2, 1, CHAN_Z );
2237 emit_mul( func, 1, 2 );
2238 emit_add( func, 0, 1 );
2239 FETCH( func, *inst, 1, 1, CHAN_W );
2240 emit_add( func, 0, 1 );
2241 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2242 STORE( func, *inst, 0, 0, chan_index );
2243 }
2244 break;
2245
2246 case TGSI_OPCODE_COS:
2247 FETCH( func, *inst, 0, 0, CHAN_X );
2248 emit_cos( func, 0, 0 );
2249 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2250 STORE( func, *inst, 0, 0, chan_index );
2251 }
2252 break;
2253
2254 case TGSI_OPCODE_DDX:
2255 return 0;
2256 break;
2257
2258 case TGSI_OPCODE_DDY:
2259 return 0;
2260 break;
2261
2262 case TGSI_OPCODE_KILP:
2263 /* predicated kill */
2264 emit_kilp( func );
2265 return 0; /* XXX fix me */
2266 break;
2267
2268 case TGSI_OPCODE_KIL:
2269 /* conditional kill */
2270 emit_kil( func, &inst->FullSrcRegisters[0] );
2271 break;
2272
2273 case TGSI_OPCODE_PK2H:
2274 return 0;
2275 break;
2276
2277 case TGSI_OPCODE_PK2US:
2278 return 0;
2279 break;
2280
2281 case TGSI_OPCODE_PK4B:
2282 return 0;
2283 break;
2284
2285 case TGSI_OPCODE_PK4UB:
2286 return 0;
2287 break;
2288
2289 case TGSI_OPCODE_RFL:
2290 return 0;
2291 break;
2292
2293 case TGSI_OPCODE_SEQ:
2294 return 0;
2295 break;
2296
2297 case TGSI_OPCODE_SFL:
2298 return 0;
2299 break;
2300
2301 case TGSI_OPCODE_SGT:
2302 return 0;
2303 break;
2304
2305 case TGSI_OPCODE_SIN:
2306 FETCH( func, *inst, 0, 0, CHAN_X );
2307 emit_sin( func, 0, 0 );
2308 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2309 STORE( func, *inst, 0, 0, chan_index );
2310 }
2311 break;
2312
2313 case TGSI_OPCODE_SLE:
2314 return 0;
2315 break;
2316
2317 case TGSI_OPCODE_SNE:
2318 return 0;
2319 break;
2320
2321 case TGSI_OPCODE_STR:
2322 return 0;
2323 break;
2324
2325 case TGSI_OPCODE_TEX:
2326 emit_tex( func, inst, FALSE, FALSE );
2327 break;
2328
2329 case TGSI_OPCODE_TXD:
2330 return 0;
2331 break;
2332
2333 case TGSI_OPCODE_UP2H:
2334 return 0;
2335 break;
2336
2337 case TGSI_OPCODE_UP2US:
2338 return 0;
2339 break;
2340
2341 case TGSI_OPCODE_UP4B:
2342 return 0;
2343 break;
2344
2345 case TGSI_OPCODE_UP4UB:
2346 return 0;
2347 break;
2348
2349 case TGSI_OPCODE_X2D:
2350 return 0;
2351 break;
2352
2353 case TGSI_OPCODE_ARA:
2354 return 0;
2355 break;
2356
2357 case TGSI_OPCODE_ARR:
2358 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2359 FETCH( func, *inst, 0, 0, chan_index );
2360 emit_rnd( func, 0, 0 );
2361 emit_f2it( func, 0 );
2362 STORE( func, *inst, 0, 0, chan_index );
2363 }
2364 break;
2365
2366 case TGSI_OPCODE_BRA:
2367 return 0;
2368 break;
2369
2370 case TGSI_OPCODE_CAL:
2371 return 0;
2372 break;
2373
2374 case TGSI_OPCODE_RET:
2375 emit_ret( func );
2376 break;
2377
2378 case TGSI_OPCODE_END:
2379 break;
2380
2381 case TGSI_OPCODE_SSG:
2382 /* TGSI_OPCODE_SGN */
2383 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2384 FETCH( func, *inst, 0, 0, chan_index );
2385 emit_sgn( func, 0, 0 );
2386 STORE( func, *inst, 0, 0, chan_index );
2387 }
2388 break;
2389
2390 case TGSI_OPCODE_CMP:
2391 emit_cmp (func, inst);
2392 break;
2393
2394 case TGSI_OPCODE_SCS:
2395 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2396 FETCH( func, *inst, 0, 0, CHAN_X );
2397 emit_cos( func, 0, 0 );
2398 STORE( func, *inst, 0, 0, CHAN_X );
2399 }
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2401 FETCH( func, *inst, 0, 0, CHAN_X );
2402 emit_sin( func, 0, 0 );
2403 STORE( func, *inst, 0, 0, CHAN_Y );
2404 }
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2406 emit_tempf(
2407 func,
2408 0,
2409 TGSI_EXEC_TEMP_00000000_I,
2410 TGSI_EXEC_TEMP_00000000_C );
2411 STORE( func, *inst, 0, 0, CHAN_Z );
2412 }
2413 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2414 emit_tempf(
2415 func,
2416 0,
2417 TEMP_ONE_I,
2418 TEMP_ONE_C );
2419 STORE( func, *inst, 0, 0, CHAN_W );
2420 }
2421 break;
2422
2423 case TGSI_OPCODE_TXB:
2424 emit_tex( func, inst, TRUE, FALSE );
2425 break;
2426
2427 case TGSI_OPCODE_NRM:
2428 /* fall-through */
2429 case TGSI_OPCODE_NRM4:
2430 /* 3 or 4-component normalization */
2431 {
2432 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2433
2434 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2435 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2436 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2437 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2438
2439 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2440
2441 /* xmm4 = src.x */
2442 /* xmm0 = src.x * src.x */
2443 FETCH(func, *inst, 0, 0, CHAN_X);
2444 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2445 emit_MOV(func, 4, 0);
2446 }
2447 emit_mul(func, 0, 0);
2448
2449 /* xmm5 = src.y */
2450 /* xmm0 = xmm0 + src.y * src.y */
2451 FETCH(func, *inst, 1, 0, CHAN_Y);
2452 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2453 emit_MOV(func, 5, 1);
2454 }
2455 emit_mul(func, 1, 1);
2456 emit_add(func, 0, 1);
2457
2458 /* xmm6 = src.z */
2459 /* xmm0 = xmm0 + src.z * src.z */
2460 FETCH(func, *inst, 1, 0, CHAN_Z);
2461 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2462 emit_MOV(func, 6, 1);
2463 }
2464 emit_mul(func, 1, 1);
2465 emit_add(func, 0, 1);
2466
2467 if (dims == 4) {
2468 /* xmm7 = src.w */
2469 /* xmm0 = xmm0 + src.w * src.w */
2470 FETCH(func, *inst, 1, 0, CHAN_W);
2471 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2472 emit_MOV(func, 7, 1);
2473 }
2474 emit_mul(func, 1, 1);
2475 emit_add(func, 0, 1);
2476 }
2477
2478 /* xmm1 = 1 / sqrt(xmm0) */
2479 emit_rsqrt(func, 1, 0);
2480
2481 /* dst.x = xmm1 * src.x */
2482 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2483 emit_mul(func, 4, 1);
2484 STORE(func, *inst, 4, 0, CHAN_X);
2485 }
2486
2487 /* dst.y = xmm1 * src.y */
2488 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2489 emit_mul(func, 5, 1);
2490 STORE(func, *inst, 5, 0, CHAN_Y);
2491 }
2492
2493 /* dst.z = xmm1 * src.z */
2494 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2495 emit_mul(func, 6, 1);
2496 STORE(func, *inst, 6, 0, CHAN_Z);
2497 }
2498
2499 /* dst.w = xmm1 * src.w */
2500 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2501 emit_mul(func, 7, 1);
2502 STORE(func, *inst, 7, 0, CHAN_W);
2503 }
2504 }
2505
2506 /* dst0.w = 1.0 */
2507 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2508 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2509 STORE(func, *inst, 0, 0, CHAN_W);
2510 }
2511 }
2512 break;
2513
2514 case TGSI_OPCODE_DIV:
2515 return 0;
2516 break;
2517
2518 case TGSI_OPCODE_DP2:
2519 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2520 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2521 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2522 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2523 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2524 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2525 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2526 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2527 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2528 }
2529 break;
2530
2531 case TGSI_OPCODE_TXL:
2532 emit_tex( func, inst, TRUE, FALSE );
2533 break;
2534
2535 case TGSI_OPCODE_TXP:
2536 emit_tex( func, inst, FALSE, TRUE );
2537 break;
2538
2539 case TGSI_OPCODE_BRK:
2540 return 0;
2541 break;
2542
2543 case TGSI_OPCODE_IF:
2544 return 0;
2545 break;
2546
2547 case TGSI_OPCODE_BGNFOR:
2548 return 0;
2549 break;
2550
2551 case TGSI_OPCODE_REP:
2552 return 0;
2553 break;
2554
2555 case TGSI_OPCODE_ELSE:
2556 return 0;
2557 break;
2558
2559 case TGSI_OPCODE_ENDIF:
2560 return 0;
2561 break;
2562
2563 case TGSI_OPCODE_ENDFOR:
2564 return 0;
2565 break;
2566
2567 case TGSI_OPCODE_ENDREP:
2568 return 0;
2569 break;
2570
2571 case TGSI_OPCODE_PUSHA:
2572 return 0;
2573 break;
2574
2575 case TGSI_OPCODE_POPA:
2576 return 0;
2577 break;
2578
2579 case TGSI_OPCODE_CEIL:
2580 return 0;
2581 break;
2582
2583 case TGSI_OPCODE_I2F:
2584 return 0;
2585 break;
2586
2587 case TGSI_OPCODE_NOT:
2588 return 0;
2589 break;
2590
2591 case TGSI_OPCODE_TRUNC:
2592 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2593 FETCH( func, *inst, 0, 0, chan_index );
2594 emit_f2it( func, 0 );
2595 emit_i2f( func, 0 );
2596 STORE( func, *inst, 0, 0, chan_index );
2597 }
2598 break;
2599
2600 case TGSI_OPCODE_SHL:
2601 return 0;
2602 break;
2603
2604 case TGSI_OPCODE_SHR:
2605 return 0;
2606 break;
2607
2608 case TGSI_OPCODE_AND:
2609 return 0;
2610 break;
2611
2612 case TGSI_OPCODE_OR:
2613 return 0;
2614 break;
2615
2616 case TGSI_OPCODE_MOD:
2617 return 0;
2618 break;
2619
2620 case TGSI_OPCODE_XOR:
2621 return 0;
2622 break;
2623
2624 case TGSI_OPCODE_SAD:
2625 return 0;
2626 break;
2627
2628 case TGSI_OPCODE_TXF:
2629 return 0;
2630 break;
2631
2632 case TGSI_OPCODE_TXQ:
2633 return 0;
2634 break;
2635
2636 case TGSI_OPCODE_CONT:
2637 return 0;
2638 break;
2639
2640 case TGSI_OPCODE_EMIT:
2641 return 0;
2642 break;
2643
2644 case TGSI_OPCODE_ENDPRIM:
2645 return 0;
2646 break;
2647
2648 default:
2649 return 0;
2650 }
2651
2652 return 1;
2653 }
2654
/**
 * Emit code for a TGSI declaration.  Only input declarations generate
 * code: for each declared input channel, interpolate the attribute
 * from the coefficients (a0, dadx, dady) into the machine's Inputs
 * array, using the declaration's interpolation mode.
 *
 * Register usage per channel: xmm0 holds the accumulating result;
 * xmm1-5 hold the x/y positions and coefficient values as noted in
 * the inline comments.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = x * dadx + y * dady + a0 */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (x * dadx + y * dady + a0) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  /* unknown interpolation mode */
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2715
/**
 * Emit code that converts vertex data from AOS layout (one packed xyzw
 * vector per vertex) into the SOA layout of tgsi_exec_machine::Inputs
 * (xxxx / yyyy / zzzz / wwww, four vertices at a time).
 *
 * The generated loop transposes a 4x4 matrix of floats per input slot
 * using movlps/movhps gathers followed by shufps shuffles.
 *
 * \param func         output buffer for the generated x86 code
 * \param arg_aos      function-argument index of the AOS source pointer
 * \param arg_machine  function-argument index of the tgsi_exec_machine
 * \param arg_num      function-argument index of the input count
 * \param arg_stride   function-argument index of the per-vertex byte stride
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;


   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   /* Load the four loop parameters from the function's arguments;
    * soa_input is advanced to point at machine->Inputs.
    */
   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Gather the xy / zw halves of four consecutive vertices
       * (aos_input is saved/restored so the stride walk below doesn't
       * disturb the per-slot base pointer).
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Shuffle into xxxx (xmm0), yyyy (xmm2), zzzz (xmm3), wwww (xmm5). */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      /* Store the four SOA channel vectors. */
      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2780
/**
 * Emit code that converts vertex results from the SOA layout of
 * tgsi_exec_machine::Outputs back to AOS layout (one packed xyzw vector
 * per vertex) — the inverse of aos_to_soa().
 *
 * The generated loop transposes four channel vectors per output slot
 * with unpcklps/unpckhps and scatters them with movlps/movhps.
 *
 * \param func         output buffer for the generated x86 code
 * \param arg_aos      function-argument index of the AOS destination pointer
 * \param arg_machine  function-argument index of the tgsi_exec_machine
 * \param arg_num      function-argument index of the output count
 * \param arg_stride   function-argument index of the per-vertex byte stride
 */
static void soa_to_aos( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   /* Load loop parameters; soa_output is advanced to machine->Outputs.
    * Note the stride is reloaded into EDX inside the loop instead of
    * being kept in a register here.
    */
   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_output,
            x86_make_disp( soa_output,
                           Offset(struct tgsi_exec_machine, Outputs) ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load the xxxx / yyyy / zzzz / wwww channel vectors. */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave into xy pairs (xmm0/xmm2) and zw pairs (xmm3/xmm5). */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* Scatter one xyzw vector to each of four vertices, walking by
       * stride; aos_output is saved/restored around the walk.
       */
      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2844
2845 /**
2846 * Translate a TGSI vertex/fragment shader to SSE2 code.
2847 * Slightly different things are done for vertex vs. fragment shaders.
2848 *
2849 * \param tokens the TGSI input shader
2850 * \param func the output SSE code/function
2851 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2853 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* Reset the code-emission cursor to the start of the buffer. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );

   /*
    * Different function args for vertex/fragment shaders:
    * (the numbers below are 1-based argument indices of the generated
    * function; they must match the caller's calling convention).
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         aos_to_soa( func,
                     4,         /* aos_input */
                     1,         /* machine */
                     5,         /* num_inputs */
                     6 );       /* input_stride */
   }

   /* Cache the machine, constant and immediate base pointers in
    * registers for the duration of the generated function.
    */
   x86_mov(
      func,
      get_machine_base(),
      x86_fn_arg( func, 1 ) );
   x86_mov(
      func,
      get_const_base(),
      x86_fn_arg( func, 2 ) );
   x86_mov(
      func,
      get_immediate_base(),
      x86_fn_arg( func, 3 ) );

   /* Fragment shaders additionally receive interpolation coefficients. */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      x86_mov(
	 func,
	 get_coef_base(),
	 x86_fn_arg( func, 4 ) );
   }

   x86_mov(
      func,
      get_sampler_base(),
      x86_make_disp( get_machine_base(),
                     Offset( struct tgsi_exec_machine, Samplers ) ) );


   /* Translate token by token; stop as soon as one fails. */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
            debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
                         opcode,
                         tgsi_get_opcode_name(opcode),
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }

         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;

            /* XXX: we only handle src/dst aliasing in a few opcodes
             * currently.  Need to use an additional temporary to hold
             * the result in the cases where the code is too opaque to
             * fix.
             */
            if (opcode != TGSI_OPCODE_MOV &&
                opcode != TGSI_OPCODE_SWZ) {
               debug_printf("Warning: src/dst aliasing in instruction"
                            " is not handled:\n");
               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
            }
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
		  parse.FullToken.FullImmediate.u[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   /* Vertex shaders convert results back to AOS layout; argument
    * indices again must match the caller's convention.
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
		     7, 	/* aos_output */
		     1, 	/* machine */
		     8, 	/* num_outputs */
		     9 );	/* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
3007
3008 #endif /* PIPE_ARCH_X86 */
3009